From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:54:28 +0200
Subject: Adding upstream version 18.2.2.

Signed-off-by: Daniel Baumann
---
 src/crypto/CMakeLists.txt | 12 + src/crypto/crypto_accel.h | 37 + src/crypto/crypto_plugin.h | 36 + src/crypto/isa-l/CMakeLists.txt | 36 + src/crypto/isa-l/isa-l_crypto/.gitignore | 27 + src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md | 39 + src/crypto/isa-l/isa-l_crypto/Doxyfile | 31 + src/crypto/isa-l/isa-l_crypto/LICENSE | 26 + src/crypto/isa-l/isa-l_crypto/Makefile.am | 161 + src/crypto/isa-l/isa-l_crypto/Makefile.nmake | 493 +++ src/crypto/isa-l/isa-l_crypto/Makefile.unx | 50 + src/crypto/isa-l/isa-l_crypto/README.md | 63 + src/crypto/isa-l/isa-l_crypto/Release_notes.txt | 215 + src/crypto/isa-l/isa-l_crypto/aes/Makefile.am | 170 + .../isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm | 1778 ++++++++ .../aes/XTS_AES_128_dec_expanded_key_avx.asm | 1748 ++++++++ .../aes/XTS_AES_128_dec_expanded_key_sse.asm | 1747 ++++++++ .../aes/XTS_AES_128_dec_expanded_key_vaes.asm | 1648 ++++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm | 1779 ++++++++ .../isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm | 1681 ++++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm | 1531 +++++++ .../aes/XTS_AES_128_enc_expanded_key_avx.asm | 1506 +++++++ .../aes/XTS_AES_128_enc_expanded_key_sse.asm | 1505 +++++++ .../aes/XTS_AES_128_enc_expanded_key_vaes.asm | 1473 +++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm | 1530 +++++++ .../isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm | 1498 +++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm | 1962 +++++++++ .../aes/XTS_AES_256_dec_expanded_key_avx.asm | 1896 +++++++++ .../aes/XTS_AES_256_dec_expanded_key_sse.asm | 1898 +++++++++ .../aes/XTS_AES_256_dec_expanded_key_vaes.asm | 1808 ++++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm | 1963 +++++++++ .../isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm | 1875 +++++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm | 1708 ++++++++ .../aes/XTS_AES_256_enc_expanded_key_avx.asm | 1653 ++++++++ .../aes/XTS_AES_256_enc_expanded_key_sse.asm | 1652 ++++++++ .../aes/XTS_AES_256_enc_expanded_key_vaes.asm | 1634 ++++++++ .../isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm | 1708 ++++++++ .../isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm | 1687 ++++++++ .../aes/aarch64/aes_gcm_aes_finalize_128.S | 215 + .../aes/aarch64/aes_gcm_aes_finalize_256.S | 220 + .../isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S | 161 + .../isa-l_crypto/aes/aarch64/aes_gcm_consts.S | 140 + .../isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S | 30 + .../isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S | 30 + .../isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S | 30 + .../isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S | 30 + .../isa-l_crypto/aes/aarch64/aes_gcm_update_128.S | 32 + .../isa-l_crypto/aes/aarch64/aes_gcm_update_256.S | 32 + .../aes/aarch64/cbc_aarch64_dispatcher.c | 108 + .../isa-l/isa-l_crypto/aes/aarch64/cbc_common.S | 54 + .../isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S | 482 +++ .../isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S | 157 + .../aes/aarch64/cbc_multibinary_aarch64.S | 38 + .../aes/aarch64/gcm_aarch64_dispatcher.c | 255 ++ .../isa-l/isa-l_crypto/aes/aarch64/gcm_common.S | 430 ++ .../isa-l_crypto/aes/aarch64/gcm_common_128.S | 165 + .../isa-l_crypto/aes/aarch64/gcm_common_256.S | 181 + .../isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S | 588 +++ .../aes/aarch64/gcm_multibinary_aarch64.S | 58 + 
.../isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S | 83 + .../isa-l/isa-l_crypto/aes/aarch64/gcm_update.S | 277 ++ .../aes/aarch64/keyexp_128_aarch64_aes.S | 134 + .../aes/aarch64/keyexp_192_aarch64_aes.S | 136 + .../aes/aarch64/keyexp_256_aarch64_aes.S | 153 + .../aes/aarch64/keyexp_aarch64_dispatcher.c | 72 + .../aes/aarch64/keyexp_multibinary_aarch64.S | 35 + .../aes/aarch64/xts_aarch64_dispatcher.c | 102 + .../isa-l_crypto/aes/aarch64/xts_aes_128_common.S | 214 + .../isa-l_crypto/aes/aarch64/xts_aes_128_dec.S | 116 + .../isa-l_crypto/aes/aarch64/xts_aes_128_enc.S | 91 + .../isa-l_crypto/aes/aarch64/xts_aes_256_common.S | 247 ++ .../isa-l_crypto/aes/aarch64/xts_aes_256_dec.S | 116 + .../isa-l_crypto/aes/aarch64/xts_aes_256_enc.S | 88 + .../isa-l_crypto/aes/aarch64/xts_aes_common.S | 232 ++ .../aes/aarch64/xts_keyexp_aes_128_dec.S | 49 + .../aes/aarch64/xts_keyexp_aes_128_enc.S | 49 + .../aes/aarch64/xts_keyexp_aes_256_dec.S | 49 + .../aes/aarch64/xts_keyexp_aes_256_enc.S | 49 + .../aes/aarch64/xts_multibinary_aarch64.S | 39 + src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm | 377 ++ src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm | 431 ++ .../isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm | 162 + .../isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm | 162 + .../isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm | 164 + .../isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm | 158 + .../isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm | 161 + .../isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm | 158 + .../isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm | 519 +++ .../isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm | 137 + .../isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm | 151 + .../isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm | 149 + .../isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm | 147 + .../isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm | 141 + .../isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm | 148 + .../isa-l/isa-l_crypto/aes/cbc_multibinary.asm | 102 + src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c | 339 ++ src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c | 56 + .../isa-l/isa-l_crypto/aes/cbc_std_vectors.h | 466 +++ .../isa-l_crypto/aes/cbc_std_vectors_random_test.c | 443 ++ .../isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c | 183 + src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm | 202 + .../isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm | 33 + .../isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm | 33 + src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm | 33 + .../isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm | 32 + .../isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm | 33 + .../isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm | 33 + .../isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm | 33 + src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm | 31 + .../isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm | 33 + .../isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm | 32 + .../isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm | 33 + src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm | 2130 ++++++++++ src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm | 3277 +++++++++++++++ src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm | 291 ++ .../isa-l_crypto/aes/gcm_keys_vaes_avx512.asm | 233 ++ .../isa-l/isa-l_crypto/aes/gcm_multibinary.asm | 184 + .../isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm | 118 
+ .../isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c | 2038 ++++++++++ .../isa-l_crypto/aes/gcm_nt_std_vectors_test.c | 322 ++ src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c | 272 ++ src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c | 61 + .../isa-l/isa-l_crypto/aes/gcm_simple_example.c | 78 + src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm | 2171 ++++++++++ .../isa-l_crypto/aes/gcm_std_vectors_random_test.c | 1940 +++++++++ .../isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c | 659 +++ .../isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm | 4296 ++++++++++++++++++++ src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h | 476 +++ src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm | 328 ++ src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm | 274 ++ src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm | 286 ++ .../isa-l/isa-l_crypto/aes/keyexp_multibinary.asm | 68 + src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h | 302 ++ .../isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c | 143 + .../isa-l/isa-l_crypto/aes/xts_128_dec_perf.c | 125 + .../isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c | 144 + .../isa-l/isa-l_crypto/aes/xts_128_enc_perf.c | 123 + .../isa-l_crypto/aes/xts_128_expanded_key_test.c | 116 + src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c | 247 ++ .../isa-l_crypto/aes/xts_128_rand_ossl_test.c | 271 ++ src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c | 106 + src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h | 1691 ++++++++ .../isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c | 145 + .../isa-l/isa-l_crypto/aes/xts_256_dec_perf.c | 126 + .../isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c | 145 + .../isa-l/isa-l_crypto/aes/xts_256_enc_perf.c | 124 + .../isa-l_crypto/aes/xts_256_expanded_key_test.c | 113 + src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c | 249 ++ .../isa-l_crypto/aes/xts_256_rand_ossl_test.c | 273 ++ src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c | 105 + src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h | 1035 +++++ .../isa-l_crypto/aes/xts_aes_128_multibinary.asm | 78 + .../isa-l_crypto/aes/xts_aes_256_multibinary.asm | 78 + src/crypto/isa-l/isa-l_crypto/autogen.sh | 17 + src/crypto/isa-l/isa-l_crypto/configure.ac | 349 ++ .../isa-l_crypto/examples/saturation_test/Makefile | 27 + .../examples/saturation_test/README.txt | 25 + .../examples/saturation_test/aes_thread.c | 380 ++ .../saturation_test/isal_multithread_perf.c | 206 + .../saturation_test/isal_multithread_perf.h | 52 + .../examples/saturation_test/md5_thread.c | 213 + .../examples/saturation_test/sha1_thread.c | 20 + .../examples/saturation_test/sha256_thread.c | 20 + .../examples/saturation_test/sha512_thread.c | 20 + .../isa-l_crypto/include/aarch64_multibinary.h | 301 ++ src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h | 165 + src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h | 613 +++ src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h | 76 + src/crypto/isa-l/isa-l_crypto/include/aes_xts.h | 214 + .../isa-l/isa-l_crypto/include/datastruct.asm | 79 + .../isa-l/isa-l_crypto/include/endian_helper.h | 83 + src/crypto/isa-l/isa-l_crypto/include/intrinreg.h | 65 + src/crypto/isa-l/isa-l_crypto/include/md5_mb.h | 372 ++ src/crypto/isa-l/isa-l_crypto/include/memcpy.asm | 615 +++ .../isa-l/isa-l_crypto/include/memcpy_inline.h | 375 ++ src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h | 315 ++ .../isa-l_crypto/include/mh_sha1_murmur3_x64_128.h | 327 ++ src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h | 315 ++ .../isa-l/isa-l_crypto/include/multi_buffer.h | 112 + .../isa-l/isa-l_crypto/include/multibinary.asm | 517 +++ 
.../isa-l/isa-l_crypto/include/reg_sizes.asm | 442 ++ .../isa-l/isa-l_crypto/include/rolling_hashx.h | 114 + src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h | 450 ++ src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h | 451 ++ src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h | 422 ++ src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h | 155 + src/crypto/isa-l/isa-l_crypto/include/test.h | 111 + src/crypto/isa-l/isa-l_crypto/include/types.h | 100 + src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def | 80 + src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in | 11 + src/crypto/isa-l/isa-l_crypto/make.inc | 340 ++ src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am | 98 + .../md5_mb/aarch64/md5_ctx_aarch64_asimd.c | 230 ++ .../md5_mb/aarch64/md5_mb_aarch64_dispatcher.c | 59 + .../isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S | 248 ++ .../isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S | 526 +++ .../md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c | 187 + .../md5_mb/aarch64/md5_mb_multibinary.S | 36 + src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c | 263 ++ .../isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c | 263 ++ .../isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c | 267 ++ .../isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c | 291 ++ .../isa-l_crypto/md5_mb/md5_ctx_base_aliases.c | 50 + src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c | 249 ++ src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm | 55 + .../isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm | 73 + .../isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm | 248 ++ .../isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm | 255 ++ .../md5_mb/md5_mb_mgr_flush_avx512.asm | 315 ++ .../isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm | 249 ++ .../isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c | 41 + .../isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c | 44 + .../isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c | 41 + .../isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm | 228 ++ .../isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm | 239 ++ .../md5_mb/md5_mb_mgr_submit_avx512.asm | 283 ++ .../isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm | 229 ++ .../isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c | 159 + .../isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c | 202 + .../isa-l_crypto/md5_mb/md5_mb_rand_update_test.c | 297 ++ src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c | 229 ++ .../isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c | 129 + .../isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm | 853 ++++ .../isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm | 783 ++++ .../isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm | 779 ++++ .../isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm | 920 +++++ .../isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm | 80 + src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c | 186 + src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am | 83 + .../mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c | 55 + .../isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c | 53 + .../mh_sha1/aarch64/mh_sha1_block_asimd.S | 124 + .../mh_sha1/aarch64/mh_sha1_block_ce.S | 384 ++ .../isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c | 53 + .../mh_sha1/aarch64/mh_sha1_multibinary.S | 35 + .../mh_sha1/aarch64/sha1_asimd_common.S | 269 ++ src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c | 141 + .../isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c | 70 + .../isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c | 40 + .../isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm | 506 +++ .../isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm | 508 +++ .../isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm | 406 ++ .../isa-l_crypto/mh_sha1/mh_sha1_block_base.c | 387 ++ .../isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm | 
498 +++ .../isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c | 122 + .../isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h | 308 ++ .../isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm | 77 + .../isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c | 180 + .../isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c | 430 ++ .../isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c | 217 + .../isa-l_crypto/mh_sha1/mh_sha1_update_base.c | 110 + .../isa-l_crypto/mh_sha1/mh_sha1_update_test.c | 240 ++ .../isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c | 204 + .../mh_sha1_murmur3_x64_128/Makefile.am | 89 + .../aarch64/mh_sha1_murmur3_aarch64_dispatcher.c | 53 + .../aarch64/mh_sha1_murmur3_aarch64_internal.h | 91 + .../aarch64/mh_sha1_murmur3_asimd.c | 54 + .../aarch64/mh_sha1_murmur3_block_asimd.S | 224 + .../aarch64/mh_sha1_murmur3_block_ce.S | 482 +++ .../aarch64/mh_sha1_murmur3_ce.c | 54 + .../aarch64/mh_sha1_murmur3_multibinary.S | 34 + .../aarch64/sha1_asimd_common.S | 271 ++ .../mh_sha1_murmur3_x64_128.c | 154 + .../mh_sha1_murmur3_x64_128_avx512.c | 67 + .../mh_sha1_murmur3_x64_128_base_aliases.c | 43 + .../mh_sha1_murmur3_x64_128_block_avx.asm | 706 ++++ .../mh_sha1_murmur3_x64_128_block_avx2.asm | 653 +++ .../mh_sha1_murmur3_x64_128_block_avx512.asm | 504 +++ .../mh_sha1_murmur3_x64_128_block_sse.asm | 702 ++++ .../mh_sha1_murmur3_x64_128_finalize_base.c | 102 + .../mh_sha1_murmur3_x64_128_internal.h | 202 + .../mh_sha1_murmur3_x64_128_multibinary.asm | 76 + .../mh_sha1_murmur3_x64_128_perf.c | 206 + .../mh_sha1_murmur3_x64_128_test.c | 248 ++ .../mh_sha1_murmur3_x64_128_update_base.c | 107 + .../mh_sha1_murmur3_x64_128_update_test.c | 272 ++ .../mh_sha1_murmur3_x64_128/murmur3_x64_128.c | 85 + .../murmur3_x64_128_internal.c | 138 + .../isa-l/isa-l_crypto/mh_sha256/Makefile.am | 88 + .../aarch64/mh_sha256_aarch64_dispatcher.c | 49 + .../mh_sha256/aarch64/mh_sha256_block_ce.S | 731 ++++ .../isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c | 53 + .../mh_sha256/aarch64/mh_sha256_multibinary.S | 35 + .../isa-l/isa-l_crypto/mh_sha256/mh_sha256.c | 143 + .../isa-l_crypto/mh_sha256/mh_sha256_avx512.c | 70 + .../mh_sha256/mh_sha256_base_aliases.c | 40 + .../isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm | 557 +++ .../mh_sha256/mh_sha256_block_avx2.asm | 616 +++ .../mh_sha256/mh_sha256_block_avx512.asm | 682 ++++ .../isa-l_crypto/mh_sha256/mh_sha256_block_base.c | 188 + .../isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm | 557 +++ .../mh_sha256/mh_sha256_finalize_base.c | 121 + .../isa-l_crypto/mh_sha256/mh_sha256_internal.h | 318 ++ .../mh_sha256/mh_sha256_multibinary.asm | 77 + .../isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c | 180 + .../isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c | 410 ++ .../isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c | 217 + .../isa-l_crypto/mh_sha256/mh_sha256_update_base.c | 110 + .../isa-l_crypto/mh_sha256/mh_sha256_update_test.c | 240 ++ .../isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c | 176 + .../isa-l/isa-l_crypto/rolling_hash/Makefile.am | 57 + .../aarch64/rolling_hash2_aarch64_dispatcher.c | 37 + .../aarch64/rolling_hash2_aarch64_multibinary.S | 35 + .../aarch64/rolling_hash2_run_until_unroll.S | 115 + .../rolling_hash/chunking_with_mb_hash.c | 222 + .../isa-l_crypto/rolling_hash/rolling_hash2.c | 169 + .../rolling_hash/rolling_hash2_base_aliases.c | 39 + .../rolling_hash/rolling_hash2_multibinary.asm | 122 + .../isa-l_crypto/rolling_hash/rolling_hash2_perf.c | 120 + .../rolling_hash/rolling_hash2_table.h | 296 ++ .../isa-l_crypto/rolling_hash/rolling_hash2_test.c | 314 ++ 
.../rolling_hash/rolling_hash2_until_00.asm | 204 + .../rolling_hash/rolling_hash2_until_04.asm | 203 + .../isa-l_crypto/rolling_hash/rolling_hashx_base.c | 65 + src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am | 130 + .../isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S | 294 ++ .../sha1_mb/aarch64/sha1_asimd_common.S | 269 ++ .../isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c | 250 ++ .../isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c | 250 ++ .../sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c | 93 + .../sha1_mb/aarch64/sha1_mb_asimd_x4.S | 192 + .../sha1_mb/aarch64/sha1_mb_mgr_asimd.c | 217 + .../isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c | 208 + .../sha1_mb/aarch64/sha1_mb_multibinary.S | 36 + .../isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S | 194 + .../isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S | 253 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c | 265 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c | 264 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c | 271 ++ .../isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c | 281 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c | 325 ++ .../isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c | 54 + .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c | 251 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c | 259 ++ src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm | 67 + .../isa-l_crypto/sha1_mb/sha1_mb_flush_test.c | 146 + .../sha1_mb/sha1_mb_mgr_datastruct.asm | 74 + .../isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm | 247 ++ .../sha1_mb/sha1_mb_mgr_flush_avx2.asm | 273 ++ .../sha1_mb/sha1_mb_mgr_flush_avx512.asm | 271 ++ .../sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm | 278 ++ .../isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm | 249 ++ .../sha1_mb/sha1_mb_mgr_flush_sse_ni.asm | 256 ++ .../isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c | 41 + .../isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c | 41 + .../isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c | 41 + .../sha1_mb/sha1_mb_mgr_submit_avx.asm | 246 ++ .../sha1_mb/sha1_mb_mgr_submit_avx2.asm | 250 ++ .../sha1_mb/sha1_mb_mgr_submit_avx512.asm | 248 ++ .../sha1_mb/sha1_mb_mgr_submit_sse.asm | 246 ++ .../sha1_mb/sha1_mb_mgr_submit_sse_ni.asm | 290 ++ .../isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c | 159 + .../isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c | 202 + .../sha1_mb/sha1_mb_rand_update_test.c | 297 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c | 233 ++ .../isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c | 128 + .../sha1_mb/sha1_mb_vs_ossl_shortage_perf.c | 132 + .../isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm | 563 +++ .../isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm | 416 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm | 413 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm | 518 +++ .../sha1_mb/sha1_multi_buffer_example.c | 112 + .../isa-l_crypto/sha1_mb/sha1_multibinary.asm | 131 + .../isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm | 318 ++ .../isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm | 484 +++ .../isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm | 485 +++ src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c | 220 + .../isa-l/isa-l_crypto/sha256_mb/Makefile.am | 127 + .../isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c | 256 ++ .../aarch64/sha256_mb_aarch64_dispatcher.c | 59 + .../sha256_mb/aarch64/sha256_mb_mgr_ce.c | 254 ++ .../sha256_mb/aarch64/sha256_mb_multibinary.S | 36 + .../sha256_mb/aarch64/sha256_mb_x1_ce.S | 238 ++ .../sha256_mb/aarch64/sha256_mb_x2_ce.S | 289 ++ .../sha256_mb/aarch64/sha256_mb_x3_ce.S | 342 ++ .../sha256_mb/aarch64/sha256_mb_x4_ce.S | 380 ++ 
.../isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c | 268 ++ .../isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c | 268 ++ .../isa-l_crypto/sha256_mb/sha256_ctx_avx512.c | 273 ++ .../isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c | 283 ++ .../isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c | 301 ++ .../sha256_mb/sha256_ctx_base_aliases.c | 54 + .../isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c | 256 ++ .../isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c | 262 ++ .../isa-l/isa-l_crypto/sha256_mb/sha256_job.asm | 65 + .../isa-l_crypto/sha256_mb/sha256_mb_flush_test.c | 146 + .../sha256_mb/sha256_mb_mgr_datastruct.asm | 74 + .../sha256_mb/sha256_mb_mgr_flush_avx.asm | 253 ++ .../sha256_mb/sha256_mb_mgr_flush_avx2.asm | 274 ++ .../sha256_mb/sha256_mb_mgr_flush_avx512.asm | 288 ++ .../sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm | 295 ++ .../sha256_mb/sha256_mb_mgr_flush_sse.asm | 254 ++ .../sha256_mb/sha256_mb_mgr_flush_sse_ni.asm | 261 ++ .../sha256_mb/sha256_mb_mgr_init_avx2.c | 41 + .../sha256_mb/sha256_mb_mgr_init_avx512.c | 41 + .../sha256_mb/sha256_mb_mgr_init_sse.c | 41 + .../sha256_mb/sha256_mb_mgr_submit_avx.asm | 260 ++ .../sha256_mb/sha256_mb_mgr_submit_avx2.asm | 246 ++ .../sha256_mb/sha256_mb_mgr_submit_avx512.asm | 261 ++ .../sha256_mb/sha256_mb_mgr_submit_sse.asm | 261 ++ .../sha256_mb/sha256_mb_mgr_submit_sse_ni.asm | 301 ++ .../sha256_mb/sha256_mb_rand_ssl_test.c | 160 + .../isa-l_crypto/sha256_mb/sha256_mb_rand_test.c | 203 + .../sha256_mb/sha256_mb_rand_update_test.c | 300 ++ .../isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c | 241 ++ .../sha256_mb/sha256_mb_vs_ossl_perf.c | 129 + .../sha256_mb/sha256_mb_vs_ossl_shortage_perf.c | 132 + .../sha256_mb/sha256_mb_x16_avx512.asm | 930 +++++ .../isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm | 431 ++ .../isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm | 426 ++ .../isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm | 620 +++ .../isa-l_crypto/sha256_mb/sha256_multibinary.asm | 125 + .../isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm | 361 ++ .../isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm | 574 +++ .../isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm | 567 +++ .../isa-l/isa-l_crypto/sha256_mb/sha256_ref.c | 204 + .../isa-l/isa-l_crypto/sha512_mb/Makefile.am | 108 + .../isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c | 256 ++ .../aarch64/sha512_mb_aarch64_dispatcher.c | 59 + .../sha512_mb/aarch64/sha512_mb_mgr_ce.c | 210 + .../sha512_mb/aarch64/sha512_mb_multibinary.S | 36 + .../sha512_mb/aarch64/sha512_mb_x1_ce.S | 269 ++ .../sha512_mb/aarch64/sha512_mb_x2_ce.S | 390 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c | 269 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c | 269 ++ .../isa-l_crypto/sha512_mb/sha512_ctx_avx512.c | 274 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c | 323 ++ .../sha512_mb/sha512_ctx_base_aliases.c | 54 + .../isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c | 255 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c | 255 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_job.asm | 54 + .../sha512_mb/sha512_mb_mgr_datastruct.asm | 72 + .../sha512_mb/sha512_mb_mgr_flush_avx.asm | 224 + .../sha512_mb/sha512_mb_mgr_flush_avx2.asm | 245 ++ .../sha512_mb/sha512_mb_mgr_flush_avx512.asm | 270 ++ .../sha512_mb/sha512_mb_mgr_flush_sse.asm | 227 ++ .../sha512_mb/sha512_mb_mgr_init_avx2.c | 45 + .../sha512_mb/sha512_mb_mgr_init_avx512.c | 42 + .../sha512_mb/sha512_mb_mgr_init_sse.c | 43 + .../sha512_mb/sha512_mb_mgr_submit_avx.asm | 262 ++ .../sha512_mb/sha512_mb_mgr_submit_avx2.asm | 270 ++ .../sha512_mb/sha512_mb_mgr_submit_avx512.asm | 280 
++ .../sha512_mb/sha512_mb_mgr_submit_sse.asm | 260 ++ .../sha512_mb/sha512_mb_rand_ssl_test.c | 160 + .../isa-l_crypto/sha512_mb/sha512_mb_rand_test.c | 203 + .../sha512_mb/sha512_mb_rand_update_test.c | 300 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c | 270 ++ .../sha512_mb/sha512_mb_vs_ossl_perf.c | 129 + .../isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm | 442 ++ .../isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm | 424 ++ .../isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm | 487 +++ .../isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm | 644 +++ .../isa-l_crypto/sha512_mb/sha512_multibinary.asm | 252 ++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ref.c | 234 ++ .../sha512_mb/sha512_sb_mgr_flush_sse4.c | 46 + .../sha512_mb/sha512_sb_mgr_init_sse4.c | 38 + .../sha512_mb/sha512_sb_mgr_submit_sse4.c | 65 + .../isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm | 396 ++ src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am | 121 + .../sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c | 65 + .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S | 387 ++ .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S | 576 +++ .../sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c | 246 ++ .../sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c | 241 ++ .../sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c | 188 + .../sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c | 250 ++ .../sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S | 36 + .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S | 237 ++ .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S | 344 ++ .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S | 368 ++ .../isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S | 440 ++ .../isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c | 284 ++ .../isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c | 292 ++ .../isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c | 314 ++ .../isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c | 54 + src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm | 65 + .../isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c | 145 + .../isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm | 77 + .../isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm | 258 ++ .../sm3_mb/sm3_mb_mgr_flush_avx512.asm | 276 ++ .../isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm | 247 ++ .../sm3_mb/sm3_mb_mgr_submit_avx512.asm | 273 ++ .../isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c | 160 + .../isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c | 206 + .../isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c | 298 ++ src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c | 250 ++ .../isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c | 128 + .../sm3_mb/sm3_mb_vs_ossl_shortage_perf.c | 133 + .../isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm | 1035 +++++ .../isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm | 711 ++++ .../isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm | 81 + .../isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c | 207 + .../isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c | 45 + .../isa-l/isa-l_crypto/tests/extended/Makefile | 19 + .../isa-l_crypto/tests/extended/Makefile.nmake | 58 + .../tests/extended/md5_mb_over_4GB_test.c | 155 + .../tests/extended/sha1_mb_over_4GB_test.c | 156 + .../tests/extended/sha256_mb_over_4GB_test.c | 156 + .../tests/extended/sha512_mb_over_4GB_test.c | 156 + .../tests/extended/sm3_mb_over_4GB_test.c | 162 + .../isa-l/isa-l_crypto/tools/check_format.sh | 87 + src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk | 123 + src/crypto/isa-l/isa-l_crypto/tools/iindent | 2 + .../isa-l/isa-l_crypto/tools/nasm-cet-filter.sh | 56 + src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh | 47 + .../tools/remove_trailing_whitespace.sh | 2 + .../isa-l/isa-l_crypto/tools/test_autorun.sh | 63 + 
src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh | 73 + .../isa-l/isa-l_crypto/tools/test_extended.sh | 127 + src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh | 11 + .../isa-l/isa-l_crypto/tools/yasm-cet-filter.sh | 47 + src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh | 38 + src/crypto/isa-l/isal_crypto_accel.cc | 43 + src/crypto/isa-l/isal_crypto_accel.h | 31 + src/crypto/isa-l/isal_crypto_plugin.cc | 34 + src/crypto/isa-l/isal_crypto_plugin.h | 47 + src/crypto/openssl/CMakeLists.txt | 14 + src/crypto/openssl/openssl_crypto_accel.cc | 104 + src/crypto/openssl/openssl_crypto_accel.h | 32 + src/crypto/openssl/openssl_crypto_plugin.cc | 32 + src/crypto/openssl/openssl_crypto_plugin.h | 36 + src/crypto/qat/CMakeLists.txt | 20 + src/crypto/qat/qat_crypto_accel.cc | 42 + src/crypto/qat/qat_crypto_accel.h | 35 + src/crypto/qat/qat_crypto_plugin.cc | 35 + src/crypto/qat/qat_crypto_plugin.h | 42 + src/crypto/qat/qcccrypto.cc | 471 +++ src/crypto/qat/qcccrypto.h | 176 + 533 files changed, 162071 insertions(+) create mode 100644 src/crypto/CMakeLists.txt create mode 100644 src/crypto/crypto_accel.h create mode 100644 src/crypto/crypto_plugin.h create mode 100644 src/crypto/isa-l/CMakeLists.txt create mode 100644 src/crypto/isa-l/isa-l_crypto/.gitignore create mode 100644 src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md create mode 100644 src/crypto/isa-l/isa-l_crypto/Doxyfile create mode 100644 src/crypto/isa-l/isa-l_crypto/LICENSE create mode 100644 src/crypto/isa-l/isa-l_crypto/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/Makefile.nmake create mode 100644 src/crypto/isa-l/isa-l_crypto/Makefile.unx create mode 100644 src/crypto/isa-l/isa-l_crypto/README.md create mode 100644 src/crypto/isa-l/isa-l_crypto/Release_notes.txt create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm create mode 100644 
src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S create mode 100644 
src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c create 
mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm create mode 100755 src/crypto/isa-l/isa-l_crypto/autogen.sh create mode 100644 src/crypto/isa-l/isa-l_crypto/configure.ac create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c create mode 100644 src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c create mode 100644 src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h create mode 100644 
src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/aes_xts.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/include/endian_helper.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/intrinreg.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/md5_mb.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/memcpy.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/test.h create mode 100644 src/crypto/isa-l/isa-l_crypto/include/types.h create mode 100644 src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def create mode 100644 src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in create mode 100644 src/crypto/isa-l/isa-l_crypto/make.inc create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm create 
mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S create mode 100644 
src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c create mode 100644 
src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm create mode 100644 
src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c create mode 100644 
src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S create mode 100644 
src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c create mode 100644 
src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/check_format.sh create mode 100644 src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/iindent create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh create mode 100755 
src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh create mode 100755 src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh create mode 100644 src/crypto/isa-l/isal_crypto_accel.cc create mode 100644 src/crypto/isa-l/isal_crypto_accel.h create mode 100644 src/crypto/isa-l/isal_crypto_plugin.cc create mode 100644 src/crypto/isa-l/isal_crypto_plugin.h create mode 100644 src/crypto/openssl/CMakeLists.txt create mode 100644 src/crypto/openssl/openssl_crypto_accel.cc create mode 100644 src/crypto/openssl/openssl_crypto_accel.h create mode 100644 src/crypto/openssl/openssl_crypto_plugin.cc create mode 100644 src/crypto/openssl/openssl_crypto_plugin.h create mode 100644 src/crypto/qat/CMakeLists.txt create mode 100644 src/crypto/qat/qat_crypto_accel.cc create mode 100644 src/crypto/qat/qat_crypto_accel.h create mode 100644 src/crypto/qat/qat_crypto_plugin.cc create mode 100644 src/crypto/qat/qat_crypto_plugin.h create mode 100644 src/crypto/qat/qcccrypto.cc create mode 100644 src/crypto/qat/qcccrypto.h (limited to 'src/crypto') diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt new file mode 100644 index 000000000..f930886d3 --- /dev/null +++ b/src/crypto/CMakeLists.txt @@ -0,0 +1,12 @@ +add_custom_target(crypto_plugins) +set(crypto_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/crypto) + +add_subdirectory(openssl) + +if(HAVE_INTEL AND HAVE_NASM_X64_AVX2 AND (NOT APPLE)) + add_subdirectory(isa-l) +endif() + +if(WITH_QAT) + add_subdirectory(qat) +endif() diff --git a/src/crypto/crypto_accel.h b/src/crypto/crypto_accel.h new file mode 100644 index 000000000..5c1593609 --- /dev/null +++ b/src/crypto/crypto_accel.h @@ -0,0 +1,37 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CRYPTO_ACCEL_H +#define CRYPTO_ACCEL_H +#include +#include "include/Context.h" + +class CryptoAccel; +typedef std::shared_ptr CryptoAccelRef; + +class CryptoAccel { + public: + CryptoAccel() {} + virtual ~CryptoAccel() {} + + static const int AES_256_IVSIZE = 128/8; + static const int AES_256_KEYSIZE = 256/8; + virtual bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) = 0; + virtual bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) = 0; +}; +#endif diff --git a/src/crypto/crypto_plugin.h b/src/crypto/crypto_plugin.h new file mode 100644 index 000000000..cf22d5cb4 --- /dev/null +++ b/src/crypto/crypto_plugin.h @@ -0,0 +1,36 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef CRYPTO_PLUGIN_H +#define CRYPTO_PLUGIN_H + +// ----------------------------------------------------------------------------- +#include "common/PluginRegistry.h" +#include "ostream" + +#include "crypto/crypto_accel.h" +// ----------------------------------------------------------------------------- + +class CryptoPlugin : public ceph::Plugin { + +public: + CryptoAccelRef cryptoaccel; + explicit CryptoPlugin(CephContext* cct) : Plugin(cct) + {} + ~CryptoPlugin() + {} + virtual int factory(CryptoAccelRef *cs, + std::ostream *ss) = 0; +}; +#endif diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt new file mode 100644 index 000000000..2a2ec0bc0 --- /dev/null +++ b/src/crypto/isa-l/CMakeLists.txt @@ -0,0 +1,36 @@ +set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto) +set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}") + +set(isal_crypto_plugin_srcs + isal_crypto_accel.cc + isal_crypto_plugin.cc + ${isal_dir}/aes/cbc_pre.c + ${isal_dir}/aes/cbc_multibinary.asm + ${isal_dir}/aes/keyexp_128.asm + ${isal_dir}/aes/keyexp_192.asm + ${isal_dir}/aes/keyexp_256.asm + ${isal_dir}/aes/keyexp_multibinary.asm + ${isal_dir}/aes/cbc_dec_128_x4_sse.asm + ${isal_dir}/aes/cbc_dec_128_x8_avx.asm + ${isal_dir}/aes/cbc_dec_192_x4_sse.asm + ${isal_dir}/aes/cbc_dec_192_x8_avx.asm + ${isal_dir}/aes/cbc_dec_256_x4_sse.asm + ${isal_dir}/aes/cbc_dec_256_x8_avx.asm + ${isal_dir}/aes/cbc_enc_128_x4_sb.asm + ${isal_dir}/aes/cbc_enc_128_x8_sb.asm + ${isal_dir}/aes/cbc_enc_192_x4_sb.asm + ${isal_dir}/aes/cbc_enc_192_x8_sb.asm + ${isal_dir}/aes/cbc_enc_256_x4_sb.asm + ${isal_dir}/aes/cbc_enc_256_x8_sb.asm) + +if(HAVE_NASM_X64) +add_dependencies(crypto_plugins ceph_crypto_isal) +endif(HAVE_NASM_X64) + +add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs}) +target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include) +set_target_properties(ceph_crypto_isal PROPERTIES + VERSION 1.0.0 + SOVERSION 1 + INSTALL_RPATH "") +install(TARGETS ceph_crypto_isal DESTINATION ${crypto_plugin_dir}) diff --git a/src/crypto/isa-l/isa-l_crypto/.gitignore b/src/crypto/isa-l/isa-l_crypto/.gitignore new file mode 100644 index 000000000..5d7ff17ad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/.gitignore @@ -0,0 +1,27 @@ +# Objects +*~ +*.o +*.lo +*.so +*.dll +*.exp +*.lib +bin + +# Autobuild +Makefile +Makefile.in +aclocal.m4 +autom4te.cache +build-aux +config.* +configure +.deps +.dirstamp +.libs +libtool + +# Generated files +isa-l_crypto.h +/libisal_crypto.la +libisal_crypto.pc diff --git a/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md new file mode 100644 index 000000000..3e95c0b54 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md @@ -0,0 +1,39 @@ +# Contributing to ISA-L_crypto + +Everyone is welcome to contribute. Patches may be submitted using GitHub pull +requests (PRs). All commits must be signed off by the developer (--signoff) +which indicates that you agree to the Developer Certificate of Origin. Patch +discussion will happen directly on the GitHub PR. Design pre-work and general +discussion occurs on the [mailing list]. Anyone can provide feedback in either +location and all discussion is welcome. Decisions on whether to merge patches +will be handled by the maintainer. + +## License + +ISA-L_crypto is licensed using a BSD 3-clause [license]. All code submitted to +the project is required to carry that license. 
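For orientation, the ceph_crypto_isal plugin configured above pulls in only the CBC and key-expansion kernels. The sketch below shows roughly how those ISA-L entry points chain together; it assumes the aes_cbc_precomp()/aes_cbc_enc_256() declarations and the cbc_key_data layout from aes_cbc.h, plus the 16-byte IV and 32-byte key sizes fixed by CryptoAccel, so treat it as an illustration rather than the plugin's actual code.

    #include <stdint.h>
    #include <string.h>
    #include "aes_cbc.h"

    /* Encrypt len bytes (a whole number of 16-byte AES blocks) of in into out. */
    int cbc256_encrypt(uint8_t *out, const uint8_t *in, uint64_t len,
                       const uint8_t iv[16], const uint8_t key[32])
    {
            _Alignas(16) struct cbc_key_data keys;  /* expanded enc/dec key schedules */
            uint8_t iv_copy[16];

            if (len == 0 || len % 16)
                    return -1;                      /* CBC operates on whole blocks only */
            aes_cbc_precomp((uint8_t *)key, 256 / 8, &keys);
            memcpy(iv_copy, iv, sizeof(iv_copy));   /* keep the caller's IV untouched */
            aes_cbc_enc_256((void *)in, iv_copy, keys.enc_keys, out, len);
            return 0;
    }

The decrypt path is symmetric: expand once with aes_cbc_precomp() and pass keys.dec_keys to aes_cbc_dec_256().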
+ +## Certificate of Origin + +In order to get a clear contribution chain of trust we use the +[signed-off-by language] used by the Linux kernel project. + +## Mailing List + +Contributors and users are welcome to submit new request on our roadmap, submit +patches, file issues, and ask questions on our [mailing list]. + +## Coding Style + +The coding style for ISA-L_crypto C code roughly follows linux kernel +guidelines. Use the included indent script to format C code. + + ./tools/iindent your_files.c + +And use check format script before submitting. + + ./tools/check_format.sh + +[mailing list]:https://lists.01.org/mailman/listinfo/isal +[license]:LICENSE +[signed-off-by language]:https://01.org/community/signed-process diff --git a/src/crypto/isa-l/isa-l_crypto/Doxyfile b/src/crypto/isa-l/isa-l_crypto/Doxyfile new file mode 100644 index 000000000..9b37aac53 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/Doxyfile @@ -0,0 +1,31 @@ +PROJECT_NAME = "Intel Intelligent Storage Acceleration Library Crypto" +PROJECT_BRIEF = "ISA-L_crypto API reference doc" + +OUTPUT_DIRECTORY = generated_doc +FULL_PATH_NAMES = NO +TAB_SIZE = 8 +ALIASES = "requires=\xrefitem requires \"Requires\" \"Instruction Set Requirements for arch-specific functions (non-multibinary)\"" +OPTIMIZE_OUTPUT_FOR_C = YES +HIDE_UNDOC_MEMBERS = YES +USE_MDFILE_AS_MAINPAGE = README.md + +INPUT = isa-l_crypto.h \ + include \ + README.md \ + CONTRIBUTING.md \ + Release_notes.txt + +EXCLUDE = include/test.h include/memcpy_inline.h include/intrinreg.h include/endian_helper.h +EXCLUDE_PATTERNS = */include/*_multibinary.h +EXAMPLE_PATH = . aes md5_mb mh_sha1 mh_sha1_murmur3_x64_128 mh_sha256 rolling_hash sha1_mb sha256_mb sha512_mb +PAPER_TYPE = letter +LATEX_SOURCE_CODE = YES +GENERATE_TREEVIEW = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +PREDEFINED = "DECLARE_ALIGNED(n, a)=ALIGN n" \ + __declspec(x)='x' \ + align(x)='ALIGN \ + x' +EXPAND_AS_DEFINED = DECLARE_ALIGNED +EXTENSION_MAPPING = "txt=md" diff --git a/src/crypto/isa-l/isa-l_crypto/LICENSE b/src/crypto/isa-l/isa-l_crypto/LICENSE new file mode 100644 index 000000000..ecebef110 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/LICENSE @@ -0,0 +1,26 @@ + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.am b/src/crypto/isa-l/isa-l_crypto/Makefile.am new file mode 100644 index 000000000..9151aab1b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/Makefile.am @@ -0,0 +1,161 @@ +EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l_crypto.def LICENSE README.md Doxyfile +CLEANFILES = +LDADD = +AM_MAKEFLAGS = --no-print-directory +noinst_HEADERS = +pkginclude_HEADERS = include/test.h include/types.h include/endian_helper.h +noinst_LTLIBRARIES = +INCLUDE = -I $(srcdir)/include/ + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = libisal_crypto.pc +EXTRA_DIST += libisal_crypto.pc.in +CLEANFILES += libisal_crypto.pc + +lsrc= +src_include= +extern_hdrs= +other_src= +check_tests= +unit_tests= +perf_tests= +unit_tests_extra= +perf_tests_extra= +examples= +other_tests= +lsrc32= +lsrc_x86_64= +lsrc_x86_32= +lsrc_aarch64= +lsrc_base_aliases= +unit_tests32= +perf_tests32= + +# Include units +include sha1_mb/Makefile.am +include mh_sha1/Makefile.am +include md5_mb/Makefile.am +include sha256_mb/Makefile.am +include sha512_mb/Makefile.am +include mh_sha1_murmur3_x64_128/Makefile.am +include mh_sha256/Makefile.am +include rolling_hash/Makefile.am +include sm3_mb/Makefile.am +include aes/Makefile.am + +# LIB version info not necessarily the same as package version +LIBISAL_CURRENT=2 +LIBISAL_REVISION=24 +LIBISAL_AGE=0 + +lib_LTLIBRARIES = libisal_crypto.la +pkginclude_HEADERS += $(sort ${extern_hdrs}) +libisal_crypto_la_SOURCES = ${lsrc} +if CPU_X86_64 +libisal_crypto_la_SOURCES += ${lsrc_x86_64} +endif + +if CPU_X86_32 +libisal_crypto_la_SOURCES += ${lsrc_x86_32} +endif + +if CPU_AARCH64 +libisal_crypto_la_SOURCES += ${lsrc_aarch64} +endif + +if CPU_UNDEFINED +libisal_crypto_la_SOURCES += ${lsrc_base_aliases} +endif + +nobase_include_HEADERS = isa-l_crypto.h +libisal_crypto_la_LDFLAGS = $(AM_LDFLAGS) \ + -version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE) +libisal_crypto_la_LIBADD = ${noinst_LTLIBRARIES} + +EXTRA_DIST += ${other_src} +EXTRA_DIST += Release_notes.txt + +# For tests +LDADD += libisal_crypto.la +check_PROGRAMS = ${check_tests} +TESTS = ${check_tests} + +# For additional tests +EXTRA_PROGRAMS = ${unit_tests} +EXTRA_PROGRAMS += ${perf_tests} +EXTRA_PROGRAMS += ${other_tests} +EXTRA_PROGRAMS += ${examples} +CLEANFILES += ${EXTRA_PROGRAMS} + +perfs: ${perf_tests} +tests: ${unit_tests} +checks: ${check_tests} +other: ${other_tests} +perf: $(addsuffix .run,$(perf_tests)) +ex: ${examples} +test: $(addsuffix .run,$(unit_tests)) + +# Build rule to run tests +%.run: % + $< + @echo Completed run: $< + +# Support for yasm/nasm +if INTEL_CET_ENABLED + export CET_LD=$(LD) +endif +if USE_YASM +if INTEL_CET_ENABLED + as_filter = ${srcdir}/tools/yasm-cet-filter.sh +else + as_filter = ${srcdir}/tools/yasm-filter.sh +endif +endif +if USE_NASM +if INTEL_CET_ENABLED + as_filter = ${srcdir}/tools/nasm-cet-filter.sh +else + as_filter = ${srcdir}/tools/nasm-filter.sh +endif +endif +if CPU_AARCH64 + as_filter = 
$(CC) -D__ASSEMBLY__ +endif +CCAS = $(as_filter) +EXTRA_DIST += tools/yasm-filter.sh tools/nasm-filter.sh +EXTRA_DIST += tools/yasm-cet-filter.sh tools/nasm-cet-filter.sh + +AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} $(src_include) ${D} +AM_CCASFLAGS = ${yasm_args} ${INCLUDE} $(src_include) ${DEFS} ${D} + +.asm.s: + @echo " MKTMP " $@; + @cp $< $@ + +# Generate isa-l_crypto.h +BUILT_SOURCES = isa-l_crypto.h +CLEANFILES += isa-l_crypto.h +isa-l_crypto.h: + @echo 'Building $@' + @echo '' >> $@ + @echo '/**' >> $@ + @echo ' * @file isa-l_crypto.h'>> $@ + @echo ' * @brief Include for ISA-L_crypto library' >> $@ + @echo ' */' >> $@ + @echo '' >> $@ + @echo '#ifndef _ISAL_CRYPTO_H_' >> $@ + @echo '#define _ISAL_CRYPTO_H_' >> $@ + @echo '' >> $@ + @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@ + @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@ + @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@ + @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@ + @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@ + @echo '' >> $@ + @for unit in $(sort $(extern_hdrs)); do echo "#include " | sed -e 's;include/;;' >> $@; done + @echo '#endif //_ISAL_CRYPTO_H_' >> $@ + +doc: isa-l_crypto.h + (cat Doxyfile; echo 'PROJECT_NUMBER=${VERSION}') | doxygen - + $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log + cp generated_doc/latex/refman.pdf isa-l_crypto_api_${VERSION}.pdf diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake new file mode 100644 index 000000000..a3e577277 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake @@ -0,0 +1,493 @@ +######################################################################## +# Copyright(c) 2011-2017 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +# This file can be auto-regenerated with $make -f Makefile.unx Makefile.nmake + +objs = \ + bin\sha1_ctx_sse.obj \ + bin\sha1_ctx_avx.obj \ + bin\sha1_ctx_avx2.obj \ + bin\sha1_ctx_base.obj \ + bin\sha1_mb_mgr_init_sse.obj \ + bin\sha1_mb_mgr_init_avx2.obj \ + bin\sha1_mb_mgr_submit_sse.obj \ + bin\sha1_mb_mgr_submit_avx.obj \ + bin\sha1_mb_mgr_submit_avx2.obj \ + bin\sha1_mb_mgr_flush_sse.obj \ + bin\sha1_mb_mgr_flush_avx.obj \ + bin\sha1_mb_mgr_flush_avx2.obj \ + bin\sha1_mb_x4_sse.obj \ + bin\sha1_mb_x4_avx.obj \ + bin\sha1_mb_x8_avx2.obj \ + bin\sha1_multibinary.obj \ + bin\sha1_ctx_avx512.obj \ + bin\sha1_mb_mgr_init_avx512.obj \ + bin\sha1_mb_mgr_submit_avx512.obj \ + bin\sha1_mb_mgr_flush_avx512.obj \ + bin\sha1_mb_x16_avx512.obj \ + bin\sha1_opt_x1.obj \ + bin\sha1_ni_x1.obj \ + bin\sha1_ni_x2.obj \ + bin\sha1_ctx_sse_ni.obj \ + bin\sha1_ctx_avx512_ni.obj \ + bin\sha1_mb_mgr_submit_sse_ni.obj \ + bin\sha1_mb_mgr_flush_sse_ni.obj \ + bin\sha1_mb_mgr_flush_avx512_ni.obj \ + bin\sha256_ctx_sse.obj \ + bin\sha256_ctx_avx.obj \ + bin\sha256_ctx_avx2.obj \ + bin\sha256_ctx_base.obj \ + bin\sha256_mb_mgr_init_sse.obj \ + bin\sha256_mb_mgr_init_avx2.obj \ + bin\sha256_mb_mgr_submit_sse.obj \ + bin\sha256_mb_mgr_submit_avx.obj \ + bin\sha256_mb_mgr_submit_avx2.obj \ + bin\sha256_mb_mgr_flush_sse.obj \ + bin\sha256_mb_mgr_flush_avx.obj \ + bin\sha256_mb_mgr_flush_avx2.obj \ + bin\sha256_mb_x4_sse.obj \ + bin\sha256_mb_x4_avx.obj \ + bin\sha256_mb_x8_avx2.obj \ + bin\sha256_multibinary.obj \ + bin\sha256_ctx_avx512.obj \ + bin\sha256_mb_mgr_init_avx512.obj \ + bin\sha256_mb_mgr_submit_avx512.obj \ + bin\sha256_mb_mgr_flush_avx512.obj \ + bin\sha256_mb_x16_avx512.obj \ + bin\sha256_opt_x1.obj \ + bin\sha256_ni_x1.obj \ + bin\sha256_ni_x2.obj \ + bin\sha256_ctx_sse_ni.obj \ + bin\sha256_ctx_avx512_ni.obj \ + bin\sha256_mb_mgr_submit_sse_ni.obj \ + bin\sha256_mb_mgr_flush_sse_ni.obj \ + bin\sha256_mb_mgr_flush_avx512_ni.obj \ + bin\sha512_ctx_sse.obj \ + bin\sha512_ctx_avx.obj \ + bin\sha512_ctx_avx2.obj \ + bin\sha512_ctx_sb_sse4.obj \ + bin\sha512_ctx_base.obj \ + bin\sha512_mb_mgr_init_sse.obj \ + bin\sha512_mb_mgr_init_avx2.obj \ + bin\sha512_sb_mgr_init_sse4.obj \ + bin\sha512_mb_mgr_submit_sse.obj \ + bin\sha512_mb_mgr_submit_avx.obj \ + bin\sha512_mb_mgr_submit_avx2.obj \ + bin\sha512_mb_mgr_flush_sse.obj \ + bin\sha512_mb_mgr_flush_avx.obj \ + bin\sha512_mb_mgr_flush_avx2.obj \ + bin\sha512_mb_x2_sse.obj \ + bin\sha512_mb_x2_avx.obj \ + bin\sha512_mb_x4_avx2.obj \ + bin\sha512_multibinary.obj \ + bin\sha512_sb_mgr_submit_sse4.obj \ + bin\sha512_sb_mgr_flush_sse4.obj \ + bin\sha512_sse4.obj \ + bin\sha512_ctx_avx512.obj \ + bin\sha512_mb_mgr_init_avx512.obj \ + bin\sha512_mb_mgr_submit_avx512.obj \ + bin\sha512_mb_mgr_flush_avx512.obj \ + bin\sha512_mb_x8_avx512.obj \ + bin\md5_ctx_sse.obj \ + bin\md5_ctx_avx.obj \ + bin\md5_ctx_avx2.obj \ + bin\md5_ctx_base.obj \ + bin\md5_mb_mgr_init_sse.obj \ + bin\md5_mb_mgr_init_avx2.obj \ + bin\md5_mb_mgr_init_avx512.obj \ + bin\md5_mb_mgr_submit_sse.obj \ + bin\md5_mb_mgr_submit_avx.obj \ + bin\md5_mb_mgr_submit_avx2.obj \ + bin\md5_mb_mgr_flush_sse.obj \ + bin\md5_mb_mgr_flush_avx.obj \ + bin\md5_mb_mgr_flush_avx2.obj \ + bin\md5_mb_x4x2_sse.obj \ + bin\md5_mb_x4x2_avx.obj \ + bin\md5_mb_x8x2_avx2.obj \ + bin\md5_multibinary.obj \ + bin\md5_mb_mgr_submit_avx512.obj \ + bin\md5_mb_mgr_flush_avx512.obj \ + bin\md5_mb_x16x2_avx512.obj \ + bin\md5_ctx_avx512.obj \ + 
bin\mh_sha1_block_base.obj \ + bin\mh_sha1_finalize_base.obj \ + bin\mh_sha1_update_base.obj \ + bin\sha1_for_mh_sha1.obj \ + bin\mh_sha1.obj \ + bin\mh_sha1_multibinary.obj \ + bin\mh_sha1_block_sse.obj \ + bin\mh_sha1_block_avx.obj \ + bin\mh_sha1_block_avx2.obj \ + bin\mh_sha1_block_avx512.obj \ + bin\mh_sha1_avx512.obj \ + bin\murmur3_x64_128_internal.obj \ + bin\mh_sha1_murmur3_x64_128.obj \ + bin\mh_sha1_murmur3_x64_128_finalize_base.obj \ + bin\mh_sha1_murmur3_x64_128_update_base.obj \ + bin\mh_sha1_murmur3_x64_128_block_sse.obj \ + bin\mh_sha1_murmur3_x64_128_block_avx.obj \ + bin\mh_sha1_murmur3_x64_128_block_avx2.obj \ + bin\mh_sha1_murmur3_x64_128_multibinary.obj \ + bin\mh_sha1_murmur3_x64_128_avx512.obj \ + bin\mh_sha1_murmur3_x64_128_block_avx512.obj \ + bin\sha256_for_mh_sha256.obj \ + bin\mh_sha256.obj \ + bin\mh_sha256_block_sse.obj \ + bin\mh_sha256_block_avx.obj \ + bin\mh_sha256_block_avx2.obj \ + bin\mh_sha256_multibinary.obj \ + bin\mh_sha256_finalize_base.obj \ + bin\mh_sha256_update_base.obj \ + bin\mh_sha256_block_base.obj \ + bin\mh_sha256_block_avx512.obj \ + bin\mh_sha256_avx512.obj \ + bin\rolling_hashx_base.obj \ + bin\rolling_hash2.obj \ + bin\rolling_hash2_until_04.obj \ + bin\rolling_hash2_until_00.obj \ + bin\rolling_hash2_multibinary.obj \ + bin\sm3_ctx_base.obj \ + bin\sm3_multibinary.obj \ + bin\sm3_ctx_avx512.obj \ + bin\sm3_mb_mgr_submit_avx512.obj \ + bin\sm3_mb_mgr_flush_avx512.obj \ + bin\sm3_mb_x16_avx512.obj \ + bin\sm3_ctx_avx2.obj \ + bin\sm3_mb_mgr_submit_avx2.obj \ + bin\sm3_mb_mgr_flush_avx2.obj \ + bin\sm3_mb_x8_avx2.obj \ + bin\gcm_multibinary.obj \ + bin\gcm_pre.obj \ + bin\gcm128_avx_gen2.obj \ + bin\gcm128_avx_gen4.obj \ + bin\gcm128_sse.obj \ + bin\gcm256_avx_gen2.obj \ + bin\gcm256_avx_gen4.obj \ + bin\gcm256_sse.obj \ + bin\gcm128_vaes_avx512.obj \ + bin\gcm256_vaes_avx512.obj \ + bin\gcm128_avx_gen2_nt.obj \ + bin\gcm128_avx_gen4_nt.obj \ + bin\gcm128_sse_nt.obj \ + bin\gcm256_avx_gen2_nt.obj \ + bin\gcm256_avx_gen4_nt.obj \ + bin\gcm256_sse_nt.obj \ + bin\gcm128_vaes_avx512_nt.obj \ + bin\gcm256_vaes_avx512_nt.obj \ + bin\gcm_multibinary_nt.obj \ + bin\keyexp_multibinary.obj \ + bin\keyexp_128.obj \ + bin\keyexp_192.obj \ + bin\keyexp_256.obj \ + bin\cbc_multibinary.obj \ + bin\cbc_dec_128_x4_sse.obj \ + bin\cbc_dec_128_x8_avx.obj \ + bin\cbc_dec_192_x4_sse.obj \ + bin\cbc_dec_192_x8_avx.obj \ + bin\cbc_dec_256_x4_sse.obj \ + bin\cbc_dec_256_x8_avx.obj \ + bin\cbc_enc_128_x4_sb.obj \ + bin\cbc_enc_128_x8_sb.obj \ + bin\cbc_enc_192_x4_sb.obj \ + bin\cbc_enc_192_x8_sb.obj \ + bin\cbc_enc_256_x4_sb.obj \ + bin\cbc_enc_256_x8_sb.obj \ + bin\cbc_dec_vaes_avx512.obj \ + bin\cbc_pre.obj \ + bin\xts_aes_128_multibinary.obj \ + bin\XTS_AES_128_dec_sse.obj \ + bin\XTS_AES_128_dec_expanded_key_sse.obj \ + bin\XTS_AES_128_enc_sse.obj \ + bin\XTS_AES_128_enc_expanded_key_sse.obj \ + bin\XTS_AES_128_dec_avx.obj \ + bin\XTS_AES_128_dec_expanded_key_avx.obj \ + bin\XTS_AES_128_enc_avx.obj \ + bin\XTS_AES_128_enc_expanded_key_avx.obj \ + bin\xts_aes_256_multibinary.obj \ + bin\XTS_AES_256_dec_avx.obj \ + bin\XTS_AES_256_dec_expanded_key_avx.obj \ + bin\XTS_AES_256_enc_avx.obj \ + bin\XTS_AES_256_enc_expanded_key_avx.obj \ + bin\XTS_AES_256_dec_sse.obj \ + bin\XTS_AES_256_dec_expanded_key_sse.obj \ + bin\XTS_AES_256_enc_sse.obj \ + bin\XTS_AES_256_enc_expanded_key_sse.obj \ + bin\XTS_AES_256_enc_vaes.obj \ + bin\XTS_AES_128_enc_vaes.obj \ + bin\XTS_AES_256_enc_expanded_key_vaes.obj \ + bin\XTS_AES_128_enc_expanded_key_vaes.obj \ + 
bin\XTS_AES_256_dec_vaes.obj \ + bin\XTS_AES_128_dec_vaes.obj \ + bin\XTS_AES_256_dec_expanded_key_vaes.obj \ + bin\XTS_AES_128_dec_expanded_key_vaes.obj + +INCLUDES = -I./ -Isha1_mb/ -Isha256_mb/ -Isha512_mb/ -Imd5_mb/ -Imh_sha1/ -Imh_sha1_murmur3_x64_128/ -Imh_sha256/ -Irolling_hash/ -Ism3_mb/ -Iaes/ -Iinclude/ +# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level +FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI +CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy +CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd +LINKFLAGS = -nologo -incremental:no -debug +CFLAGS = $(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $(FEAT_FLAGS) $(INCLUDES) $(D) +AFLAGS = -f win64 $(FEAT_FLAGS) $(INCLUDES) $(D) +CC = cl +# or CC = icl -Qstd=c99 +AS = nasm + +lib: bin static dll +static: bin isa-l_crypto_static.lib +dll: bin isa-l_crypto.dll + +bin: ; -mkdir $@ + +isa-l_crypto_static.lib: $(objs) + lib -out:$@ @<< +$? +<< + +isa-l_crypto.dll: $(objs) + link -out:$@ -dll -def:isa-l_crypto.def $(LINKFLAGS) @<< +$? +<< + +{sha1_mb}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{sha1_mb}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{sha256_mb}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{sha256_mb}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{sha512_mb}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{sha512_mb}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{md5_mb}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{md5_mb}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{mh_sha1}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{mh_sha1}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{mh_sha1_murmur3_x64_128}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{mh_sha1_murmur3_x64_128}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{mh_sha256}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{mh_sha256}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{rolling_hash}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{rolling_hash}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{sm3_mb}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{sm3_mb}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + +{aes}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{aes}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? + + +# Examples +ex = \ + sha1_multi_buffer_example.exe \ + gcm_simple_example.exe + +ex: lib $(ex) + +$(ex): $(@B).obj + +.obj.exe: + link /out:$@ $(LINKFLAGS) isa-l_crypto.lib $? + +# Check tests +checks = \ + sha1_mb_test.exe \ + sha1_mb_rand_test.exe \ + sha1_mb_rand_update_test.exe \ + sha1_mb_flush_test.exe \ + sha256_mb_test.exe \ + sha256_mb_rand_test.exe \ + sha256_mb_rand_update_test.exe \ + sha256_mb_flush_test.exe \ + sha512_mb_test.exe \ + sha512_mb_rand_test.exe \ + sha512_mb_rand_update_test.exe \ + md5_mb_test.exe \ + md5_mb_rand_test.exe \ + md5_mb_rand_update_test.exe \ + mh_sha1_test.exe \ + mh_sha256_test.exe \ + rolling_hash2_test.exe \ + sm3_ref_test.exe \ + cbc_std_vectors_test.exe \ + gcm_std_vectors_test.exe \ + gcm_nt_std_vectors_test.exe \ + xts_128_test.exe \ + xts_256_test.exe \ + xts_128_expanded_key_test.exe \ + xts_256_expanded_key_test.exe + +checks: lib $(checks) +$(checks): $(@B).obj +check: $(checks) + !$? 
+ +# Unit tests +tests = \ + sha1_mb_rand_ssl_test.exe \ + sha256_mb_rand_ssl_test.exe \ + sha512_mb_rand_ssl_test.exe \ + md5_mb_rand_ssl_test.exe \ + mh_sha1_update_test.exe \ + mh_sha1_murmur3_x64_128_test.exe \ + mh_sha1_murmur3_x64_128_update_test.exe \ + mh_sha256_update_test.exe \ + sm3_mb_rand_ssl_test.exe \ + sm3_mb_rand_test.exe \ + sm3_mb_rand_update_test.exe \ + sm3_mb_flush_test.exe \ + sm3_mb_test.exe \ + cbc_std_vectors_random_test.exe \ + gcm_std_vectors_random_test.exe \ + gcm_nt_rand_test.exe \ + xts_128_rand.exe \ + xts_128_rand_ossl_test.exe \ + xts_256_rand.exe \ + xts_256_rand_ossl_test.exe + +tests: lib $(tests) +$(tests): $(@B).obj + +# Performance tests +perfs = \ + sha1_mb_vs_ossl_perf.exe \ + sha1_mb_vs_ossl_shortage_perf.exe \ + sha256_mb_vs_ossl_perf.exe \ + sha256_mb_vs_ossl_shortage_perf.exe \ + sha512_mb_vs_ossl_perf.exe \ + md5_mb_vs_ossl_perf.exe \ + mh_sha1_perf.exe \ + mh_sha1_murmur3_x64_128_perf.exe \ + mh_sha256_perf.exe \ + rolling_hash2_perf.exe \ + sm3_mb_vs_ossl_perf.exe \ + sm3_mb_vs_ossl_shortage_perf.exe \ + cbc_ossl_perf.exe \ + gcm_ossl_perf.exe \ + xts_128_enc_ossl_perf.exe \ + xts_256_enc_ossl_perf.exe \ + xts_128_enc_perf.exe \ + xts_128_dec_perf.exe \ + xts_128_dec_ossl_perf.exe \ + xts_256_enc_perf.exe \ + xts_256_dec_perf.exe \ + xts_256_dec_ossl_perf.exe + +perfs: lib $(perfs) +$(perfs): $(@B).obj + +progs = + +progs: lib $(progs) + +clean: + -if exist *.obj del *.obj + -if exist bin\*.obj del bin\*.obj + -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib + -if exist *.exe del *.exe + -if exist *.pdb del *.pdb + -if exist isa-l_crypto.lib del isa-l_crypto.lib + -if exist isa-l_crypto.dll del isa-l_crypto.dll + -if exist isa-l_crypto.exp del isa-l_crypto.exp + +libcrypto.lib: +sha1_mb_rand_test.exe: sha1_ref.obj +sha1_mb_rand_update_test.exe: sha1_ref.obj +sha1_mb_flush_test.exe: sha1_ref.obj +sha1_mb_rand_ssl_test.exe: libcrypto.lib +sha1_mb_vs_ossl_perf.exe: libcrypto.lib +sha1_mb_vs_ossl_shortage_perf.exe: libcrypto.lib +sha256_mb_rand_ssl_test.exe: sha256_ref.obj +sha256_mb_rand_test.exe: sha256_ref.obj +sha256_mb_rand_update_test.exe: sha256_ref.obj +sha256_mb_flush_test.exe: sha256_ref.obj +sha256_mb_rand_ssl_test.exe: libcrypto.lib +sha256_mb_vs_ossl_perf.exe: libcrypto.lib +sha256_mb_vs_ossl_shortage_perf.exe: libcrypto.lib +sha512_mb_rand_test.exe: sha512_ref.obj +sha512_mb_rand_update_test.exe: sha512_ref.obj +sha512_mb_rand_ssl_test.exe: libcrypto.lib +sha512_mb_vs_ossl_perf.exe: libcrypto.lib +md5_mb_rand_test.exe: md5_ref.obj +md5_mb_rand_update_test.exe: md5_ref.obj +md5_mb_rand_ssl_test.exe: libcrypto.lib +md5_mb_vs_ossl_perf.exe: libcrypto.lib +mh_sha1_test.exe: mh_sha1_ref.obj +mh_sha1_update_test.exe: mh_sha1_ref.obj +mh_sha1_murmur3_x64_128_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj +mh_sha1_murmur3_x64_128_update_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj +mh_sha1_murmur3_x64_128_perf.exe: mh_sha1_ref.obj murmur3_x64_128.obj +mh_sha256_test.exe: mh_sha256_ref.obj +mh_sha256_update_test.exe: mh_sha256_ref.obj +sm3_mb_rand_ssl_test.exe: libcrypto.lib +sm3_mb_rand_ssl_test.exe: sm3_test_helper.obj +sm3_mb_rand_update_test.exe: libcrypto.lib +sm3_mb_rand_update_test.exe: sm3_test_helper.obj +sm3_mb_flush_test.exe: libcrypto.lib +sm3_mb_flush_test.exe: sm3_test_helper.obj +sm3_mb_rand_test.exe: libcrypto.lib +sm3_mb_rand_test.exe: sm3_test_helper.obj +sm3_mb_vs_ossl_perf.exe: libcrypto.lib +sm3_mb_vs_ossl_perf.exe: sm3_test_helper.obj +sm3_mb_vs_ossl_shortage_perf.exe: libcrypto.lib 
+sm3_mb_vs_ossl_shortage_perf.exe: sm3_test_helper.obj +cbc_ossl_perf.exe: libcrypto.lib +cbc_std_vectors_random_test.exe: libcrypto.lib +gcm_ossl_perf.exe: libcrypto.lib +gcm_std_vectors_random_test.exe: libcrypto.lib +gcm_nt_rand_test.exe: libcrypto.lib +xts_128_enc_ossl_perf.exe: libcrypto.lib +xts_128_dec_ossl_perf.exe: libcrypto.lib +xts_128_rand_ossl_test.exe: libcrypto.lib +xts_256_enc_ossl_perf.exe: libcrypto.lib +xts_256_dec_ossl_perf.exe: libcrypto.lib +xts_256_rand_ossl_test.exe: libcrypto.lib diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.unx b/src/crypto/isa-l/isa-l_crypto/Makefile.unx new file mode 100644 index 000000000..7452f71b0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/Makefile.unx @@ -0,0 +1,50 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +host_cpu ?= $(shell uname -m | sed -e 's/amd/x86_/') +arch ?= $(shell uname | grep -v -e Linux -e BSD ) + + + +units ?=sha1_mb sha256_mb sha512_mb md5_mb mh_sha1 mh_sha1_murmur3_x64_128 \ + mh_sha256 rolling_hash sm3_mb +ifneq ($(arch),noarch) +units +=aes +endif +ifeq ($(host_cpu)_$(arch),aarch64_) + arch = aarch64 +endif +default: lib +include $(foreach unit,$(units), $(unit)/Makefile.am) + +# Override individual lib names to make one inclusive library. +lib_name := bin/isa-l_crypto.a + +include make.inc +include tools/gen_nmake.mk +VPATH = . $(units) include diff --git a/src/crypto/isa-l/isa-l_crypto/README.md b/src/crypto/isa-l/isa-l_crypto/README.md new file mode 100644 index 000000000..f9f560c54 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/README.md @@ -0,0 +1,63 @@ +Intel(R) Intelligent Storage Acceleration Library Crypto Version +================================================================ + +ISA-L_crypto is a collection of optimized low-level functions targeting storage +applications. 
ISA-L_crypto includes: + +* Multi-buffer hashes - run multiple hash jobs together on one core for much + better throughput than single-buffer versions. + - SHA1, SHA256, SHA512, MD5, SM3 + +* Multi-hash - Get the performance of multi-buffer hashing with a single-buffer + interface. Specification ref : [Multi-Hash white paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/multi-hash-paper.pdf) + +* Multi-hash + murmur - run both together. + +* AES - block ciphers + - XTS, GCM, CBC + +* Rolling hash - Hash input in a window which moves through the input + +Also see: +* [ISA-L_crypto for updates](https://github.com/intel/isa-l_crypto). +* For non-crypto ISA-L see [isa-l on github](https://github.com/intel/isa-l). +* The [github wiki](https://github.com/intel/isa-l/wiki) covering isa-l and + isa-l crypto. +* [Contributing](CONTRIBUTING.md). + +Building ISA-L +-------------- + +### Prerequisites + +* Assembler: nasm v2.11.01 or later (nasm v2.13 or better suggested for building in AVX512 support) + or yasm version 1.2.0 or later. +* Compiler: gcc, clang, icc or VC compiler. +* Make: GNU 'make' or 'nmake' (Windows). +* Optional: Building with autotools requires autoconf/automake packages. + +### Autotools +To build and install the library with autotools it is usually sufficient to run: + + ./autogen.sh + ./configure + make + sudo make install + +### Makefile +To use a standard makefile run: + + make -f Makefile.unx + +### Windows +On Windows use nmake to build dll and static lib: + + nmake -f Makefile.nmake + +### Other make targets +Other targets include: +* `make check` : create and run tests +* `make tests` : create additional unit tests +* `make perfs` : create included performance tests +* `make ex` : build examples +* `make doc` : build API manual diff --git a/src/crypto/isa-l/isa-l_crypto/Release_notes.txt b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt new file mode 100644 index 000000000..097107585 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt @@ -0,0 +1,215 @@ +================================================================================ +v2.24 Intel Intelligent Storage Acceleration Library Crypto Release Notes +================================================================================ + +================================================================================ +RELEASE NOTE CONTENTS +================================================================================ +1. KNOWN ISSUES +2. FIXED ISSUES +3. CHANGE LOG & FEATURES ADDED + +================================================================================ +1. KNOWN ISSUES +================================================================================ + +* Perf tests do not run in Windows environment. + +* 32-bit lib is not supported in Windows. + +================================================================================ +2. FIXED ISSUES +================================================================================ +v2.21 + +* Put correct vec instruction versions in aes_cbc_enc_{128,192,256}(). May help + performance on some systems. + +v2.20 + +* Fix issue with new aes_gcm API, aes_gcm_pre_256 was incorrect. + +* Multi-buffer hash max length extended. Previous max length for + {sha1,sha256,sha512,md5}_mb was 4095MB. While there is still a 4GB limit for + each submit, the total hashed length can now be larger then 4GB. + +v2.18 + +* Fix for multi-buffer hash when total length is above 512MB. 
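The job-based multi-buffer interface summarized in the README above follows the same shape for every hash: initialize a context manager, submit independent buffers, then flush until the remaining lanes drain. A minimal SHA-256 sketch, assuming the SHA256_HASH_CTX_MGR type and the sha256_ctx_mgr_init()/submit()/flush() calls declared in sha256_mb.h:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    #define JOBS 4
    #define LEN  4096

    int main(void)
    {
            SHA256_HASH_CTX_MGR *mgr = NULL;
            SHA256_HASH_CTX ctx[JOBS];
            static uint8_t bufs[JOBS][LEN];

            /* the manager carries SIMD lane state; keep it 16-byte aligned */
            if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
                    return 1;
            sha256_ctx_mgr_init(mgr);

            for (int i = 0; i < JOBS; i++) {
                    memset(bufs[i], i, LEN);
                    hash_ctx_init(&ctx[i]);         /* one context per job */
                    /* whole buffer per job here; HASH_FIRST/UPDATE/LAST allow streaming */
                    sha256_ctx_mgr_submit(mgr, &ctx[i], bufs[i], LEN, HASH_ENTIRE);
            }
            while (sha256_ctx_mgr_flush(mgr) != NULL)
                    ;                               /* drain lanes that never filled */

            for (int i = 0; i < JOBS; i++)
                    printf("job %d digest word 0: %08x\n", i, ctx[i].job.result_digest[0]);
            free(mgr);
            return 0;
    }

The bundled sha1_multi_buffer_example.c follows the same submit/flush pattern for SHA-1, with per-context error checking added.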
+ +v2.14 + +* Building in unit directories is no longer supported removing the issue of + leftover object files causing the top-level make build to fail. + +v2.9 + +* Multi-buffer MD5 AVX2 tests fixed to work on FreeBSD 9.1 by explicitly aligning + structures. + +v2.7 + +* Unit tests and examples are now supported in Windows environment + + +================================================================================ +3. CHANGE LOG & FEATURES ADDED +================================================================================ +v2.24 + +* New optimized version of AES-CBC decode + +* New AVX2, 8 lane version of multi-buffer SM3 + +* Added support for big-endian architectures + +v2.23 + +* New optimized versions of block ciphers AES-GCM and AES-XTS. + +* New optimized versions of multi-buffer SM3 hashing. Removed experimental + status. + +v2.22 + +* New multi-buffer SM3 functions. Experimental base functions only. + +* New multi-arch support. + +v2.21 + +* Multi-buffer hash performance improvement for Intel(R) Atom(tm) processors. + New by-2 shani versions for multi-buffer sha1 & sha256. + +* New base functions for multi-buffer hashes. + md5_mb, sha1_mb, sha256_mb, sha512_mb. + +v2.20 + +* New functions + - Non-temporal versions of aes_gcm added. + +* Multi-buffer hash improvement + - Increase max length of hash in {sha1,sha256,sha512,md5}_mb to > 4GB. + +v2.19 + +* Multi-buffer hash (sha1_mb, sha256_mb) + + - Choose fast single buffer routine to do flush operation if lanes aren't full. + + - Add SHA-NI support for Goldmont and Cannonlake. + +* AES-GCM interface updates. + + - New interface separates the expanded keys and other context into two + structures. The old interface is maintained for backward compatibility. + + - User no longer has to append the GCM_IV_END_MARK manually to then end of iv + as this is now done automatically. This update should also improve performance + of small packets. + +* Rolling hash is released. + +v2.18 + +* New multi-hash SHA256-based version. + +v2.16 + +* Split lib from non-crypto functions. + +v2.15 + +* Multi-buffer hash updates. New AVX512 versions for multi-buffer SHA1, SHA256, + SHA512, MD5 and SHA1+murmur3_x64_128 stitched. + +* Removes restrictions on AAD length in AES-GCM. Previously AAD length was + limited to a multiple of 4 bytes. Now any AAD length is allowed. + +* Nasm support. ISA-L ported to build with nasm or yasm assembler. + +* Windows DLL support. Windows builds DLL by default. + +* The older, deprecated multi-buffer API has been removed. + +v2.14 + +* New multi-hash sha1 function and multi-hash sha1 + murmur3_x64_128 stitched. + Multi-hash is designed to give the performance of multi-buffer cryptographic + hashes with a synchronous single buffer interface. + +* New AES-GCM and AES-CBC functions added. + +* Autoconf and autotools build allows easier porting to additional systems. + Previous make system still available to embedded users with Makefile.unx. + +* The AES key expand functions that were used for AES-XTS with pre-expanded keys + now expand the decrypt keys in a different order. The order that decrypt keys + are stored and used by XTS_AES_128_dec_expanded_key() is reversed from + previous versions to be compatable with CBC and GCM key expansion. The + aes_keyexp_*() and XTS_AES_128_dec_expanded_key() functions should work the + same when paired together. + +* Includes update for building on Mac OS X/darwin systems. Add --target=darwin + to ./configure step. + +v2.10 + +* Added multi-buffer MD5 in the new hash API. 
Includes multi-binary capability, + no restriction on update length and other benefits of the CTX API. + +v2.9 + +* New multi-buffer hash API. The new API brings the following new features to + multi-buffer hashes. The older API is still included but may be deprecated in + future releases. + + - Multibinary functionality. Call one function and the appropriate + architecture-specific version is fixed up at runtime. + + - No restriction on update length. Submitting an update block no longer has + to have length a multiple of the fundamental block size. + +* New expanded key tests added for AES-XTS 128 and 256 + +v2.7 + +* New AVX2 versions for mb_md5 and mb_sha512 hashing code have been added. + +v2.6 + +* Update buffer functionality added to mb_md5, mb_sha256 and mb_sha512 hashing + code. Requires API changes to current interface to specify job type and total + length of hash. + +* New AVX2 versions for mb_sha1 and mb_sha256 hashing code have been added. + +v2.5 + +* New feature for multi-buffer SHA-1, update buffer. mb_sha1 non-finalize jobs + can now be submitted by setting flags in job structure. Requires API changes + to current interface to specify job type and total length of hash. + +v2.4 + +* Added new multi-buffer SHA-512: mb_sha512. SSE, AVX versions. + +v2.3 + +* Added improved AES XTS versions. + +v2.2 + +* Added new AVX versions of multi-buffer hashes +* Changed type in the interface struct for multi-buffer hashes + the len field in the following structures :JOB_SHA1,JOB_MD5, + JOB_SHA256 is now a 32-bit int. + +v2.0 + +* Added AES XTS units aes_xts_128, aes_xts_256 + +v1.3 + +* Added new multi-buffer units for SHA-256 and MD5: mb_sha256, mb_md5. diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am new file mode 100644 index 000000000..d1f4e5781 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am @@ -0,0 +1,170 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +# Assembling AES requires including cbc_common.asm, gcm_defines.asm +src_include += -I $(srcdir)/aes + +extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h + +lsrc_x86_64 += aes/gcm_multibinary.asm aes/gcm_pre.c +lsrc_x86_64 += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm +lsrc_x86_64 += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm +lsrc_x86_64 += aes/gcm128_vaes_avx512.asm aes/gcm256_vaes_avx512.asm +lsrc_x86_64 += aes/gcm128_avx_gen2_nt.asm aes/gcm128_avx_gen4_nt.asm aes/gcm128_sse_nt.asm +lsrc_x86_64 += aes/gcm256_avx_gen2_nt.asm aes/gcm256_avx_gen4_nt.asm aes/gcm256_sse_nt.asm +lsrc_x86_64 += aes/gcm128_vaes_avx512_nt.asm aes/gcm256_vaes_avx512_nt.asm + +lsrc_x86_64 += aes/gcm_multibinary_nt.asm + +lsrc_x86_64 += aes/keyexp_multibinary.asm +lsrc_x86_64 += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm +lsrc_x86_64 += aes/cbc_multibinary.asm +lsrc_x86_64 += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm +lsrc_x86_64 += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm +lsrc_x86_64 += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm +lsrc_x86_64 += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm +lsrc_x86_64 += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm +lsrc_x86_64 += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm +lsrc_x86_64 += aes/cbc_dec_vaes_avx512.asm +lsrc_x86_64 += aes/cbc_pre.c +lsrc_x86_64 += aes/xts_aes_128_multibinary.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm +lsrc_x86_64 += aes/xts_aes_256_multibinary.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_avx.asm aes/XTS_AES_256_dec_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_expanded_key_vaes.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += aes/gcm_pre.c \ + aes/aarch64/gcm_multibinary_aarch64.S \ + aes/aarch64/keyexp_multibinary_aarch64.S \ + aes/aarch64/gcm_aarch64_dispatcher.c \ + aes/aarch64/keyexp_aarch64_dispatcher.c \ + aes/aarch64/keyexp_128_aarch64_aes.S \ + 
aes/aarch64/keyexp_192_aarch64_aes.S \ + aes/aarch64/keyexp_256_aarch64_aes.S \ + aes/aarch64/aes_gcm_aes_finalize_128.S \ + aes/aarch64/aes_gcm_aes_init.S \ + aes/aarch64/aes_gcm_enc_dec_128.S \ + aes/aarch64/aes_gcm_precomp_128.S \ + aes/aarch64/aes_gcm_update_128.S \ + aes/aarch64/aes_gcm_aes_finalize_256.S \ + aes/aarch64/aes_gcm_consts.S \ + aes/aarch64/aes_gcm_enc_dec_256.S \ + aes/aarch64/aes_gcm_precomp_256.S \ + aes/aarch64/aes_gcm_update_256.S \ + aes/aarch64/xts_aarch64_dispatcher.c \ + aes/aarch64/xts_aes_128_dec.S \ + aes/aarch64/xts_aes_128_enc.S \ + aes/aarch64/xts_keyexp_aes_128_dec.S \ + aes/aarch64/xts_keyexp_aes_128_enc.S \ + aes/aarch64/xts_aes_256_dec.S \ + aes/aarch64/xts_aes_256_enc.S \ + aes/aarch64/xts_keyexp_aes_256_dec.S \ + aes/aarch64/xts_keyexp_aes_256_enc.S \ + aes/aarch64/xts_multibinary_aarch64.S \ + aes/cbc_pre.c \ + aes/aarch64/cbc_multibinary_aarch64.S \ + aes/aarch64/cbc_aarch64_dispatcher.c \ + aes/aarch64/cbc_enc_aes.S \ + aes/aarch64/cbc_dec_aes.S + +other_src += include/multibinary.asm +other_src += include/test.h include/types.h include/reg_sizes.asm +other_src += aes/gcm_defines.asm +other_src += aes/aes_common.asm +other_src += aes/clear_regs.asm +other_src += aes/cbc_common.asm aes/cbc_std_vectors.h +other_src += aes/gcm_vectors.h aes/ossl_helper.h +other_src += aes/xts_128_vect.h +other_src += aes/xts_256_vect.h +other_src += aes/gcm_sse.asm +other_src += aes/gcm_avx_gen2.asm +other_src += aes/gcm_avx_gen4.asm +other_src += aes/gcm_keys_vaes_avx512.asm +other_src += aes/gcm_vaes_avx512.asm + +check_tests += aes/cbc_std_vectors_test +check_tests += aes/gcm_std_vectors_test +check_tests += aes/gcm_nt_std_vectors_test +check_tests += aes/xts_128_test +check_tests += aes/xts_256_test +check_tests += aes/xts_128_expanded_key_test +check_tests += aes/xts_256_expanded_key_test + +unit_tests += aes/cbc_std_vectors_random_test +unit_tests += aes/gcm_std_vectors_random_test +unit_tests += aes/gcm_nt_rand_test +unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test +unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test + +perf_tests += aes/cbc_ossl_perf +perf_tests += aes/gcm_ossl_perf +perf_tests += aes/xts_128_enc_ossl_perf +perf_tests += aes/xts_256_enc_ossl_perf +perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf +perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf + +examples += aes/gcm_simple_example + +cbc_ossl_perf: LDLIBS += -lcrypto +aes_cbc_ossl_perf_LDFLAGS = -lcrypto +cbc_std_vectors_random_test: LDLIBS += -lcrypto +aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto +gcm_ossl_perf: LDLIBS += -lcrypto +aes_gcm_ossl_perf_LDFLAGS = -lcrypto +gcm_std_vectors_random_test: LDLIBS += -lcrypto +aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto +gcm_nt_rand_test: LDLIBS += -lcrypto +aes_gcm_nt_rand_test_LDFLAGS = -lcrypto +xts_128_enc_ossl_perf: LDLIBS += -lcrypto +aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto +xts_128_dec_ossl_perf: LDLIBS += -lcrypto +aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto +xts_128_rand_ossl_test: LDLIBS += -lcrypto +aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto +xts_256_enc_ossl_perf : LDLIBS += -lcrypto +aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto +xts_256_dec_ossl_perf : LDLIBS += -lcrypto +aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto +xts_256_rand_ossl_test: LDLIBS += -lcrypto +aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm new 
file mode 100644 index 000000000..85582c0df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm @@ -0,0 +1,1778 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_avx( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + 
VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + vaeskeygenassist %%xraw_key, 
%%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl 
+ mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + 
vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == 
%%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl 
twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_avx, function +XTS_AES_128_dec_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov 
[TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu 
xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu 
[ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc 
twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, 
xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm new file mode 100644 index 000000000..faa7e895e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm @@ -0,0 +1,1748 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu 
%%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa 
%%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 
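+	; the %if (0 == %%lt128) fragments interleaved between the AES rounds of this macro
+	; compute the next eight tweak values, so the integer tweak update overlaps the
+	; latency of the vaesdec instructions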
+%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if 
(%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph 
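+	; the shl/adc pair above shifts the 128-bit tweak held in twtempl:twtemph left by one
+	; bit; the cmovc/xor pair that follows folds GHASH_POLY (0x87) into the low byte when
+	; bit 127 carried out, i.e. a multiply by x in GF(2^128)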
+%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_avx, function +XTS_AES_128_dec_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + 
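+	; each xor/shl/adc/cmovc/xor group here derives the next tweak, T(i+1) = x*T(i) in
+	; GF(2^128); tweaks 1-7 end up in xmm9-xmm15 while tweak 8 stays only in [TW+16*7]
+	; and is passed to encrypt_by_eight from memory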
vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, 
[ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu 
[ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc 
twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, 
xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm new file mode 100644 index 000000000..0b1b637be --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm @@ -0,0 +1,1747 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, 
[%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*0] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] 
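+	; note: the ptr_plaintext/ptr_ciphertext names are inherited from the encrypt variant;
+	; in this decrypt routine ptr_plaintext addresses the ciphertext input (ct) and
+	; ptr_ciphertext the plaintext output (pt)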
+%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl 
twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 
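+	; note: the encrypt_initial/encrypt_by_eight macro names are shared with the encrypt
+	; variant; the rounds here issue aesdec/aesdeclast, so the data blocks are decrypted
+	; (only the tweak itself is encrypted, by encrypt_T)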
+%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + 
aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast 
%%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_sse, function +XTS_AES_128_dec_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, 
target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, 
twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + 
movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu 
[ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + 
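+	; the sub above re-bases ptr_ciphertext so that the fixed +112/+16*7
+	; offsets used by _steal_cipher and _done land on the last block, which
+	; is handed over in xmm8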
movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different 
alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm new file mode 100644 index 000000000..7f243949a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm @@ -0,0 +1,1648 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
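+; Tweak schedule used throughout this file: XTS multiplies the 128-bit tweak
+; by the primitive element alpha of GF(2^128) for each successive block.
+; With the reduction polynomial x^128 + x^7 + x^2 + x + 1 this becomes
+;
+;	T(i+1) = (T(i) << 1) ^ (0x87 if the bit shifted out of T(i) was set)
+;
+; The scalar shl twtempl / adc twtemph / cmovc / xor sequences implement this
+; on the two 64-bit tweak halves (GHASH_POLY = 0x87); the zmm paths do the
+; same reduction with vpclmulqdq against zpoly.  A minimal C sketch of the
+; same update, assuming the tweak is held as two little-endian 64-bit words
+; (t[0] low, t[1] high, i.e. the twtempl/twtemph pair):
+;
+;	#include <stdint.h>
+;
+;	static void xts_next_tweak(uint64_t t[2])
+;	{
+;		uint64_t carry = t[1] >> 63;	/* bit shifted out of T(i) */
+;		t[1] = (t[1] << 1) | (t[0] >> 63);
+;		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
+;	}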
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa 
[%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 
+%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 
+ vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 
16 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_vaes, function +XTS_AES_128_dec_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 
16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-due last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu 
[ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + 
vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] 
+%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, 
xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: 
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_dec_expanded_key_vaes +no_XTS_AES_128_dec_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm new file mode 100644 index 000000000..19f887c2f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm @@ -0,0 +1,1779 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_sse( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx 
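(Illustrative aside, not part of the upstream file.) The prototype comment above fixes the calling convention for XTS_AES_128_dec_sse. A caller might look like the following C sketch; the extern declaration simply restates the documented signature using stdint types, and the 512-byte sector with all-zero keys is a placeholder value only.

#include <stdint.h>

/* Restates the signature documented above; resolved at link time by the
 * ISA-L crypto library that this patch adds. */
extern void XTS_AES_128_dec_sse(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                                uint64_t N, const uint8_t *ct, uint8_t *pt);

int main(void)
{
        uint8_t k2[16]    = {0};   /* tweak key                      */
        uint8_t k1[16]    = {0};   /* "ECB" decryption key           */
        uint8_t tweak[16] = {0};   /* initial tweak (e.g. sector #)  */
        uint8_t ct[512]   = {0};   /* ciphertext sector input        */
        uint8_t pt[512];           /* plaintext sector output        */

        /* N is the sector size in bytes; the routine expects at least
         * one full 16-byte block. */
        XTS_AES_128_dec_sse(k2, k1, tweak, sizeof(ct), ct, pt);
        return 0;
}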
+ %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + movdqu %%xkey2, [%%ptr_key2] + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa 
[%%ptr_expanded_keys + 16*4], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa 
%%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + 
aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], 
twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + 
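(Illustrative aside, not part of the upstream file.) The shl/adc/cmovc/xor sequences interleaved with the AES rounds above all perform the same operation: advancing the XTS tweak by one block, i.e. multiplying it by x in GF(2^128) and reducing with GHASH_POLY (0x87) when a bit is shifted out of the top. A minimal C model of one such update, keeping the tweak in two little-endian 64-bit halves exactly like twtempl/twtemph, is sketched below.

#include <stdint.h>

/* One tweak update, mirroring the twtempl/twtemph sequence above:
 * multiply the 128-bit tweak by x in GF(2^128) and reduce with the
 * polynomial 0x87 when the top bit falls off. */
static void xts_mult_x(uint64_t tw[2])
{
        uint64_t carry_lo = tw[0] >> 63;    /* bit moving into the high half */
        uint64_t carry_hi = tw[1] >> 63;    /* bit falling off the top       */

        tw[0] <<= 1;                        /* shl twtempl, 1       */
        tw[1] = (tw[1] << 1) | carry_lo;    /* adc twtemph, twtemph */
        if (carry_hi)                       /* cmovc + xor with 0x87 */
                tw[0] ^= 0x87;
}

In the assembly the carry out of the low half is propagated by adc and the carry out of the high half selects the 0x87 reduction via cmovc, which keeps the update branch-free and lets it be interleaved with the aesdec rounds.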
+%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor 
%%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_sse, function +XTS_AES_128_dec_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor 
twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 
16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;decrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + 
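(Illustrative aside, not part of the upstream file.) The _steal_cipher_* paths above save the current tweak at [TW + 16*0] and switch to the next one because XTS ciphertext stealing runs "backwards" on decryption: the last full ciphertext block is decrypted with the later tweak, while the block rebuilt from the stolen bytes is decrypted with the earlier one. A hedged C sketch of that tail handling follows; aes128_xts_block_dec is a hypothetical helper standing in for the tweak-XOR / AES-128 decryption rounds / tweak-XOR sequence that the macros above perform with the expanded keys.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: out = tweak XOR AES-128-dec(in XOR tweak),
 * using the expanded decryption round keys. */
void aes128_xts_block_dec(uint8_t out[16], const uint8_t in[16],
                          const uint8_t tweak[16], const void *expanded_keys);

/* Decrypt the last 16 + r ciphertext bytes (0 < r < 16).  As in the
 * _steal_cipher_* paths above, the *next* tweak (tw_next) is used for
 * the last full block, and the current tweak (tw_cur) is kept for the
 * block rebuilt from the stolen bytes. */
static void xts_dec_steal(uint8_t *pt, const uint8_t *ct, size_t r,
                          const uint8_t tw_cur[16], const uint8_t tw_next[16],
                          const void *keys)
{
        uint8_t b[16], cc[16];

        aes128_xts_block_dec(b, ct, tw_next, keys);   /* last full block   */
        memcpy(cc, ct + 16, r);                       /* stolen ciphertext */
        memcpy(cc + r, b + r, 16 - r);                /* borrowed tail     */
        aes128_xts_block_dec(pt, cc, tw_cur, keys);   /* penultimate block */
        memcpy(pt + 16, b, r);                        /* final r bytes     */
}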
sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm new file mode 100644 index 000000000..e3435dd83 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm @@ -0,0 +1,1681 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
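(Illustrative aside, not part of the upstream file.) The pshufb_shf_table and mask1 constants a few lines above, in XTS_AES_128_dec_sse.asm, drive the byte shifts used during cipher stealing. The shuffle itself can be modelled in C as below: a mask byte with its top bit set clears the output lane, otherwise its low four bits select a source byte. Reading 16 mask bytes at pshufb_shf_table + N therefore shifts a block left by 16 - N bytes, and reading at pshufb_shf_table + 16 - N then XORing with mask1 (all bytes 0x80) flips which lanes are kept, giving the matching right shift applied to the block holding the stolen bytes.

#include <stdint.h>

/* C model of the pshufb step in _steal_cipher: out[i] is zero when the
 * mask byte has bit 7 set, otherwise the source byte it indexes. */
static void pshufb_model(uint8_t out[16], const uint8_t src[16],
                         const uint8_t mask[16])
{
        for (int i = 0; i < 16; i++)
                out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
}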
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_vavx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, 
%%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 
%%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; 
%%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if 
(%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 
0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_vaes, function +XTS_AES_128_dec_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-due last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, 
xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, 
xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 
2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add 
ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + 
vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 
0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_dec_vaes +no_XTS_AES_128_dec_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm new file mode 100644 index 000000000..819617283 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm @@ -0,0 +1,1531 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define 
%%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + 
vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define 
%%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc 
%%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast 
%%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW 
+ 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_avx, function +XTS_AES_128_enc_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, 
GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je 
_last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu 
xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret 
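+;----------------------------------------------------------------------------
+; The shl/adc/cmovc/xor register sequences used throughout this routine
+; advance the XTS tweak by one block: they multiply the 128-bit tweak by
+; x (alpha) in GF(2^128) reduced by x^128 + x^7 + x^2 + x + 1
+; (GHASH_POLY = 0x87), and they are interleaved ("stitched") with the AES
+; rounds so the integer work hides under the vaesenc latency.  The
+; _steal_cipher path above is standard XTS ciphertext stealing for a final
+; partial block.  A minimal C sketch of the same tweak update (an
+; illustration only, not code taken from or used by this file; tw[0]/tw[1]
+; correspond to the twtempl/twtemph halves):
+;
+;   #include <stdint.h>
+;
+;   /* multiply the 128-bit tweak by x in GF(2^128), 0x87 reduction */
+;   static void xts_tweak_times_alpha(uint64_t tw[2])
+;   {
+;       uint64_t carry_lo = tw[0] >> 63;   /* bit moving from low to high  */
+;       uint64_t carry_hi = tw[1] >> 63;   /* bit shifted out of 128 bits  */
+;       tw[1] = (tw[1] << 1) | carry_lo;   /* shl twtempl,1 / adc twtemph  */
+;       tw[0] = (tw[0] << 1) ^ (carry_hi ? 0x87 : 0);  /* cmovc + xor      */
+;   }
+;----------------------------------------------------------------------------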
+ + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu 
[ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm new file mode 100644 index 000000000..f0f5f02f5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm @@ -0,0 +1,1506 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" +default rel + +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, 
[%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, 
[TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif 
+%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + 
vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW 
+ 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == 
%%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_avx, function +XTS_AES_128_enc_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + 
; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + 
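+ ; Each stanza here advances the tweak once more: the 128-bit value in
+ ; twtempl:twtemph is doubled with shl/adc (multiply by x in GF(2^128)) and
+ ; cmovc/xor fold in the 0x87 reduction only when a bit is shifted out.
+ ; The eight tweaks end up in [TW]; xmm9-xmm15 pick up the first seven and
+ ; the eighth is passed to encrypt_by_eight from [TW + 16*7].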
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + 
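+ ; At this point the first N_val bytes of the last full ciphertext block
+ ; (kept unshifted in xmm2) have been written out as the short final block,
+ ; i.e. the "stolen" ciphertext.  The code below merges the N_val-byte
+ ; plaintext tail (xmm3) with the remaining 16-N_val ciphertext bytes to
+ ; rebuild a full block, encrypts it under the next tweak, and _done stores
+ ; it at ptr_ciphertext + 16*7, overwriting the bytes scribbled just above.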
+ ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu 
[ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm 
b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm new file mode 100644 index 000000000..8ac162c4c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm @@ -0,0 +1,1505 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, 
[%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov 
[TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + 
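+	; The %if (0 == %%lt128) blocks interleaved with the AES rounds of this macro
+	; compute the next eight tweak values while the cipher rounds are in flight.
+	; Each new tweak is the previous one multiplied by x in GF(2^128) under the
+	; XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 (GHASH_POLY = 0x87).
+	; In C-like terms, one doubling step on the halves twtempl/twtemph is:
+	;     carry   = twtemph >> 63;
+	;     twtemph = (twtemph << 1) | (twtempl >> 63);
+	;     twtempl = (twtempl << 1) ^ (carry ? 0x87 : 0);
+	; which is what each xor/shl/adc/cmovc/xor group computes.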
%if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif 
+%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 
8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + 
movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_sse, function +XTS_AES_128_enc_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add 
ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + 
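+	; On the cipher-stealing path xmm8 now holds the encrypted stolen block
+	; (plaintext tail padded with the trailing bytes of the previous last
+	; ciphertext block, whitened with the extra tweak); the common _done code
+	; below writes xmm8 out as the last full ciphertext block.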
+_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store 
ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm new file mode 100644 index 000000000..730fdcba9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm @@ -0,0 +1,1473 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_vaes( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor 
%%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, 
ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 
== %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, 
zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + 
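+ ; when this is not the final pass (0 == %%last_eight), zmm15/zmm16 computed above hold
+ ; TW3/TW4 advanced by 8 blocks (each 128-bit lane shifted left one byte, i.e. multiplied
+ ; by x^8, with the carried-out top byte reduced by the 0x87 polynomial via vpclmulqdq),
+ ; and zmm17/zmm18 hold them advanced by a further 8 blocks; they are loaded into
+ ; TW1-TW4 after the final tweak xor below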
+ ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_vaes, function +XTS_AES_128_enc_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + movdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, 
zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, 
zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu 
[ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], 
xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_enc_expanded_key_vaes +no_XTS_AES_128_enc_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm new file mode 100644 index 000000000..cbb98cc38 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm @@ -0,0 +1,1530 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_sse( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 
+%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + movdqu %%xkey2, [%%ptr_key2] + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + 
aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; 
tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif 
+%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif 
+%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + 
aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_sse, function +XTS_AES_128_enc_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer 
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store 
ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 
+ movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, 
xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + 
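+	; note: with a single whole block nothing is stored at this point - the lone ciphertext block (xmm1) is kept in xmm8 below and written out at _done, or after _steal_cipher when a trailing partial block follows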
+ sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm new file mode 100644 index 000000000..3532ddda5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm @@ -0,0 +1,1498 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_vavx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + 
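+	; note: every round below follows the same interleaved pattern - generate the next round key for key2 and key1, advance the tweak encryption with key2's round key (vaesenc), and spill key1's round key to the stack for use on the data blocks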
vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load 
initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, 
%%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + 
vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa 
%%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + 
vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_vaes, function +XTS_AES_128_enc_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + 
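+	; six whole blocks remain: encrypt them as 4+2 blocks (zmm1/ymm2), then keep the last ciphertext block in xmm8 and the next tweak in xmm0 in case a trailing partial block needs ciphertext stealing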
vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + movdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, 
zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, 
[_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7*16) + cmp tmp1, (6*16) + je _num_blocks_is_6 + cmp tmp1, (5*16) + je _num_blocks_is_5 + cmp tmp1, (4*16) + je _num_blocks_is_4 + cmp tmp1, (3*16) + je _num_blocks_is_3 + cmp tmp1, (2*16) + je _num_blocks_is_2 + cmp tmp1, (1*16) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu 
[ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_enc_vaes +no_XTS_AES_128_enc_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm new file mode 100644 index 000000000..776525bdd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm @@ -0,0 +1,1962 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx 
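+	; note: on win64 these temporaries reuse rcx/rdx, whose key-pointer argument values are no longer needed once encrypt_T has expanded the keys; rdi/rsi are saved to _gpr and restored on exit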
+ %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa 
[%%ptr_expanded_keys+16*9], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + 
key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 
3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec 
%%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 
+%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, 
%%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_avx, function +XTS_AES_256_dec_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 
xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu 
[ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, 
[ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 9 + vaesdec xmm8, [keys + 16*11] ; round 9 + vaesdec xmm8, [keys + 16*12] ; round 9 + vaesdec xmm8, [keys + 16*13] ; round 9 + vaesdeclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + 
cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + 
vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + 
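+; Note on the tweak arithmetic used throughout this file: each
+; shl twtempl / adc twtemph / cmovc / xor sequence multiplies the current
+; 128-bit tweak by x in GF(2^128), reducing by the XTS polynomial
+; x^128 + x^7 + x^2 + x + 1 (GHASH_POLY = 0x87). Roughly, in C terms
+; (illustrative sketch only; tw_lo/tw_hi name the low/high qwords held in
+; twtempl/twtemph):
+;     uint64_t carry = tw_hi >> 63;              /* bit 127 of the tweak      */
+;     tw_hi = (tw_hi << 1) | (tw_lo >> 63);      /* shift the 128-bit value   */
+;     tw_lo = (tw_lo << 1) ^ (carry ? 0x87 : 0); /* fold in the polynomial    */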
+_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm new file mode 100644 index 000000000..d52d0977e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm @@ -0,0 +1,1896 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + 
vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 
+%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if 
(%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) 
+ vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec 
%%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, 
%%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa 
%%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_expanded_key_avx, function +XTS_AES_256_dec_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp 
_steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 9 + vaesdec xmm8, [keys + 16*11] ; round 9 + vaesdec xmm8, [keys + 16*12] ; round 9 + vaesdec xmm8, [keys + 16*13] ; round 9 + vaesdeclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + 
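+	; The sequence above multiplies the 128-bit tweak held in twtempl:twtemph
+	; by x in GF(2^128): shl/adc perform the 128-bit left shift, and when a bit
+	; falls out of bit 127 the carry selects the reduction constant 0x87
+	; (ghash_poly_8b), which is xor-ed into the low qword as required for
+	; XTS tweak generation (IEEE P1619). The result is stored as the next tweak below.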
mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + 
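+	; Common tail convention: ptr_ciphertext is rewound by (8 - num_blocks)*16 and
+	; the last whole decrypted block is left in xmm8, so the shared _done /
+	; _steal_cipher code can always write the final block at [ptr_ciphertext + 16*7].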
vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je 
_done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm new file mode 100644 index 000000000..2e77e5e80 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm @@ -0,0 +1,1898 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*14] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*13] + movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, 
[%%ptr_key1 + 16*12] + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*11] + movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*11] + aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*12] + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*13] + aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*14] + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*0] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if 
(%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if 
(%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 
+%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, 
%%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + 
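+	; When %%last_eight is 0, the scalar instructions interleaved with the aesdec
+	; rounds above ("stitching") have computed the next eight tweak values while the
+	; AES latency was being hidden. The last two qwords (the eighth tweak) are written
+	; out only here, after the final round, and TW1-TW7 are reloaded from the stack;
+	; the eighth tweak is passed to this macro as the memory operand [TW + 16*7].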
mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_expanded_key_sse, function +XTS_AES_256_dec_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 
5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov 
[TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, 
[pshufb_shf_table]
+	movdqu	xmm0, [twtempl+N_val]
+	pshufb	xmm8, xmm0
+
+
+	movdqu	xmm3, [ptr_plaintext + 112 + N_val]	; state register is temporarily xmm3 to eliminate a move
+	movdqu	[ptr_ciphertext + 112 + N_val], xmm8
+
+	; shift xmm3 to the right by 16-N_val bytes
+	lea	twtempl, [pshufb_shf_table +16]
+	sub	twtempl, N_val
+	movdqu	xmm0, [twtempl]
+	pxor	xmm0, [mask1]
+	pshufb	xmm3, xmm0
+
+	pblendvb	xmm3, xmm2	;xmm0 is implicit
+
+	; xor Tweak value
+	movdqa	xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+	;decrypt last block with cipher stealing
+	pxor	xmm8, [keys]		; ARK
+	aesdec	xmm8, [keys + 16*1]	; round 1
+	aesdec	xmm8, [keys + 16*2]	; round 2
+	aesdec	xmm8, [keys + 16*3]	; round 3
+	aesdec	xmm8, [keys + 16*4]	; round 4
+	aesdec	xmm8, [keys + 16*5]	; round 5
+	aesdec	xmm8, [keys + 16*6]	; round 6
+	aesdec	xmm8, [keys + 16*7]	; round 7
+	aesdec	xmm8, [keys + 16*8]	; round 8
+	aesdec	xmm8, [keys + 16*9]	; round 9
+	aesdec	xmm8, [keys + 16*10]	; round 10
+	aesdec	xmm8, [keys + 16*11]	; round 11
+	aesdec	xmm8, [keys + 16*12]	; round 12
+	aesdec	xmm8, [keys + 16*13]	; round 13
+	aesdeclast	xmm8, [keys + 16*14]	; round 14
+
+	; xor Tweak value
+	pxor	xmm8, [TW]
+
+_done:
+	; store last ciphertext value
+	movdqu	[ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+	mov	rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	rdi, [_gpr + 8*1]
+	mov	rsi, [_gpr + 8*2]
+
+
+	movdqa	xmm6, [_xmm + 16*0]
+	movdqa	xmm7, [_xmm + 16*1]
+	movdqa	xmm8, [_xmm + 16*2]
+	movdqa	xmm9, [_xmm + 16*3]
+	movdqa	xmm10, [_xmm + 16*4]
+	movdqa	xmm11, [_xmm + 16*5]
+	movdqa	xmm12, [_xmm + 16*6]
+	movdqa	xmm13, [_xmm + 16*7]
+	movdqa	xmm14, [_xmm + 16*8]
+	movdqa	xmm15, [_xmm + 16*9]
+%endif
+
+	add	rsp, VARIABLE_OFFSET
+
+	ret
+
+
+
+
+
+_less_than_128_bytes:
+	cmp	N_val, 16
+	jb	_ret_
+
+	mov	tmp1, N_val
+	and	tmp1, (7 << 4)
+	cmp	tmp1, (6 << 4)
+	je	_num_blocks_is_6
+	cmp	tmp1, (5 << 4)
+	je	_num_blocks_is_5
+	cmp	tmp1, (4 << 4)
+	je	_num_blocks_is_4
+	cmp	tmp1, (3 << 4)
+	je	_num_blocks_is_3
+	cmp	tmp1, (2 << 4)
+	je	_num_blocks_is_2
+	cmp	tmp1, (1 << 4)
+	je	_num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+	sub	ptr_plaintext, 16*1
+
+	and	N_val, 15	; N_val = N_val mod 16
+	je	_done_7
+
+_steal_cipher_7:
+	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
+	shl	twtempl, 1
+	adc	twtemph, twtemph
+	cmovc	ghash_poly_8b_temp, ghash_poly_8b
+	xor	twtempl, ghash_poly_8b_temp
+	mov	[TW+8*2], twtempl
+	mov	[TW+8*3], twtemph
+
+	movdqa	[TW + 16*0] , xmm15
+	movdqa	xmm15, [TW+16*1]
+
+	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+	; store ciphertext
+	movdqu	[ptr_ciphertext+16*0], xmm1
+	movdqu	[ptr_ciphertext+16*1], xmm2
+	movdqu	[ptr_ciphertext+16*2], xmm3
+	movdqu	[ptr_ciphertext+16*3], xmm4
+	movdqu	[ptr_ciphertext+16*4], xmm5
+	movdqu	[ptr_ciphertext+16*5], xmm6
+
+	sub	ptr_ciphertext, 16*1
+	movdqa	xmm8, xmm7
+	jmp	_steal_cipher
+
+_done_7:
+	encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+	; store ciphertext
+	movdqu	[ptr_ciphertext+16*0], xmm1
+	movdqu	[ptr_ciphertext+16*1], xmm2
+	movdqu	[ptr_ciphertext+16*2], xmm3
+	movdqu	[ptr_ciphertext+16*3], xmm4
+	movdqu	[ptr_ciphertext+16*4], xmm5
+	movdqu	[ptr_ciphertext+16*5], xmm6
+
+	sub	ptr_ciphertext, 16*1
+	movdqa	xmm8, xmm7
+ jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store 
ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 
0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm new file mode 100644 index 000000000..69228c18c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm @@ -0,0 +1,1808 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_vaes( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys 
in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov 
[TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 
+%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 
16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_expanded_key_vaes, function +XTS_AES_256_dec_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, 
ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-due last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + 
decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 
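+
+	; The pre-computed tweaks above follow the XTS convention: each new
+	; tweak is the previous one multiplied by x in GF(2^128), reduced by
+	; x^128 + x^7 + x^2 + x + 1 (the 0x87 held in ghash_poly_8b / zpoly).
+	; One doubling step is, in effect:
+	;
+	;	carry = tweak >> 127            ; bit shifted out at the top
+	;	tweak = tweak << 1              ; 128-bit left shift
+	;	if (carry) tweak ^= 0x87        ; fold the carry back in
+	;
+	; which is what the scalar shl/adc/cmovc/xor sequences on
+	; twtempl:twtemph do elsewhere in this file. Here the
+	; vpsllvq/vpsrlvq/vpclmulqdq block builds tweaks 0-7 (the initial
+	; tweak times x^0..x^7) in zmm9/zmm10, and the
+	; vpsrldq/vpclmulqdq/vpslldq/vpxord block just above multiplies each
+	; of those by x^8 (a one-byte shift plus reduction of the carried-out
+	; byte) to give tweaks 8-15 in zmm11/zmm12.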
+ +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 9 + vaesdec xmm8, [keys + 16*11] ; round 9 + vaesdec xmm8, [keys + 16*12] ; round 9 + vaesdec xmm8, [keys + 16*13] ; round 9 + vaesdeclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, 
rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_dec_expanded_key_vaes +no_XTS_AES_256_dec_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm new file mode 100644 index 000000000..3904c8a54 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm @@ -0,0 +1,1963 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_sse( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx 
+ %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 10101010b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + movdqu %%xkey2_2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + movdqu %%xkey1_2, [%%ptr_key1 + 16*1] + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*13], %%xtmp2 + + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*10], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*9], %%xtmp2 + + + + 
aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 
; round 14 for tweak encryption + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if 
(%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + 
xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec 
%%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + 
+%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_sse, function +XTS_AES_256_dec_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + 
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + 
xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + 
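+; Two exit paths for the final eight blocks: _done_final below is taken when the
+; sector length is a multiple of 16 bytes, so the last eight blocks are decrypted
+; with their tweaks in natural order and no ciphertext stealing is needed. The path
+; above handles a partial final block: a fresh tweak is generated for the last full
+; block while that block's original tweak is saved at TW[0] (the "swap tweak values
+; for cipher stealing for decrypt" step), so _steal_cipher can decrypt the stolen
+; tail with the correct tweak.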
+_done_final:
+	; load plaintext
+	movdqu	xmm1, [ptr_plaintext+16*0]
+	movdqu	xmm2, [ptr_plaintext+16*1]
+	movdqu	xmm3, [ptr_plaintext+16*2]
+	movdqu	xmm4, [ptr_plaintext+16*3]
+	movdqu	xmm5, [ptr_plaintext+16*4]
+	movdqu	xmm6, [ptr_plaintext+16*5]
+	movdqu	xmm7, [ptr_plaintext+16*6]
+	movdqu	xmm8, [ptr_plaintext+16*7]
+	encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+	; store ciphertext
+	movdqu	[ptr_ciphertext+16*0], xmm1
+	movdqu	[ptr_ciphertext+16*1], xmm2
+	movdqu	[ptr_ciphertext+16*2], xmm3
+	movdqu	[ptr_ciphertext+16*3], xmm4
+	movdqu	[ptr_ciphertext+16*4], xmm5
+	movdqu	[ptr_ciphertext+16*5], xmm6
+	movdqu	[ptr_ciphertext+16*6], xmm7
+
+	jmp	_done
+
+
+_steal_cipher:
+	; start cipher stealing
+
+
+	movdqa	xmm2, xmm8
+
+	; shift xmm8 to the left by 16-N_val bytes
+	lea	twtempl, [pshufb_shf_table]
+	movdqu	xmm0, [twtempl+N_val]
+	pshufb	xmm8, xmm0
+
+
+	movdqu	xmm3, [ptr_plaintext + 112 + N_val]	; state register is temporarily xmm3 to eliminate a move
+	movdqu	[ptr_ciphertext + 112 + N_val], xmm8
+
+	; shift xmm3 to the right by 16-N_val bytes
+	lea	twtempl, [pshufb_shf_table +16]
+	sub	twtempl, N_val
+	movdqu	xmm0, [twtempl]
+	pxor	xmm0, [mask1]
+	pshufb	xmm3, xmm0
+
+	pblendvb	xmm3, xmm2	;xmm0 is implicit
+
+	; xor Tweak value
+	movdqa	xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+	;encrypt last block with cipher stealing
+	pxor	xmm8, [keys]		; ARK
+	aesdec	xmm8, [keys + 16*1]	; round 1
+	aesdec	xmm8, [keys + 16*2]	; round 2
+	aesdec	xmm8, [keys + 16*3]	; round 3
+	aesdec	xmm8, [keys + 16*4]	; round 4
+	aesdec	xmm8, [keys + 16*5]	; round 5
+	aesdec	xmm8, [keys + 16*6]	; round 6
+	aesdec	xmm8, [keys + 16*7]	; round 7
+	aesdec	xmm8, [keys + 16*8]	; round 8
+	aesdec	xmm8, [keys + 16*9]	; round 9
+	aesdec	xmm8, [keys + 16*10]	; round 10
+	aesdec	xmm8, [keys + 16*11]	; round 11
+	aesdec	xmm8, [keys + 16*12]	; round 12
+	aesdec	xmm8, [keys + 16*13]	; round 13
+	aesdeclast	xmm8, [keys + 16*14]	; round 14
+
+	; xor Tweak value
+	pxor	xmm8, [TW]
+
+_done:
+	; store last ciphertext value
+	movdqu	[ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+	mov	rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	rdi, [_gpr + 8*1]
+	mov	rsi, [_gpr + 8*2]
+
+
+	movdqa	xmm6, [_xmm + 16*0]
+	movdqa	xmm7, [_xmm + 16*1]
+	movdqa	xmm8, [_xmm + 16*2]
+	movdqa	xmm9, [_xmm + 16*3]
+	movdqa	xmm10, [_xmm + 16*4]
+	movdqa	xmm11, [_xmm + 16*5]
+	movdqa	xmm12, [_xmm + 16*6]
+	movdqa	xmm13, [_xmm + 16*7]
+	movdqa	xmm14, [_xmm + 16*8]
+	movdqa	xmm15, [_xmm + 16*9]
+%endif
+
+	add	rsp, VARIABLE_OFFSET
+
+	ret
+
+
+
+
+
+_less_than_128_bytes:
+	cmp	N_val, 16
+	jb	_ret_
+
+	mov	tmp1, N_val
+	and	tmp1, (7 << 4)
+	cmp	tmp1, (6 << 4)
+	je	_num_blocks_is_6
+	cmp	tmp1, (5 << 4)
+	je	_num_blocks_is_5
+	cmp	tmp1, (4 << 4)
+	je	_num_blocks_is_4
+	cmp	tmp1, (3 << 4)
+	je	_num_blocks_is_3
+	cmp	tmp1, (2 << 4)
+	je	_num_blocks_is_2
+	cmp	tmp1, (1 << 4)
+	je	_num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+	initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+	sub	ptr_plaintext, 16*1
+
+	and	N_val, 15	; N_val = N_val mod 16
+	je	_done_7
+
+_steal_cipher_7:
+	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
+	shl	twtempl, 1
+	adc	twtemph, twtemph
+	cmovc	ghash_poly_8b_temp, ghash_poly_8b
+	xor	twtempl, ghash_poly_8b_temp
+	mov	[TW+8*2], twtempl
+	mov	[TW+8*3], twtemph
+
+	movdqa	[TW + 16*0] , xmm15
+	movdqa
xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b 
+ xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm new file mode 100644 index 000000000..3e26e5c04 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm @@ -0,0 +1,1875 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+	%define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_vaes(
+;	UINT8 *k2,	// key used for tweaking, 16*2 bytes
+;	UINT8 *k1,	// key used for "ECB" encryption, 16*2 bytes
+;	UINT8 *TW_initial,	// initial tweak value, 16 bytes
+;	UINT64 N,	// sector size, in bytes
+;	const UINT8 *ct,	// ciphertext sector input data
+;	UINT8 *pt);	// plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+	%xdefine ptr_key2 rdi
+	%xdefine ptr_key1 rsi
+	%xdefine T_val rdx
+	%xdefine N_val rcx
+	%xdefine ptr_plaintext r8
+	%xdefine ptr_ciphertext r9
+%else
+	%xdefine ptr_key2 rcx
+	%xdefine ptr_key1 rdx
+	%xdefine T_val r8
+	%xdefine N_val r9
+	%xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+	%xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+	%define tmp1 rdi
+	%define ghash_poly_8b r10
+	%define ghash_poly_8b_temp r11
+%else
+	%define tmp1 rcx
+	%define ghash_poly_8b rdi
+	%define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is current round key
+; round_key value after this key_expansion_256 macro is next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+	vpshufd %%xraw_key, %%xraw_key, 11111111b
+	vshufps %%xtmp, %%xround_key, 00010000b
+	vpxor %%xround_key, %%xtmp
+	vshufps %%xtmp, %%xround_key, 10001100b
+	vpxor %%xround_key, %%xtmp
+	vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+	vpshufd %%xraw_key,
%%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip 
%%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu 
%%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + 
vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 
8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, 
[keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 
+%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_vaes, function +XTS_AES_256_dec_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + 
vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-due last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 
[ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 
4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 10 + vaesdec xmm8, [keys + 16*11] ; round 11 + vaesdec xmm8, [keys + 16*12] ; round 12 + vaesdec xmm8, [keys + 16*13] ; round 13 + vaesdeclast xmm8, [keys + 16*14] ; round 14 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu 
[ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, 
xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + 
adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. 
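+; (Descriptive note, matching the comment on the %else above: when
+; AS_FEATURE_LEVEL < 10 the assembler cannot emit the VAES/AVX512 opcodes used
+; in this file, so no XTS_AES_256_dec_vaes body is assembled; the win64 branch
+; below only defines the placeholder symbol no_XTS_AES_256_dec_vaes.)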
+%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_dec_vaes +no_XTS_AES_256_dec_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm new file mode 100644 index 000000000..0993ff909 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm @@ -0,0 +1,1708 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 10 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%ptr_key2 %8 +%define %%ptr_key1 %9 +%define 
%%ptr_expanded_keys %10 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + 
key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, 
[ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + 
vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 
+%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa 
%%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + 
vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_avx, function +XTS_AES_256_enc_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa 
[_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa 
xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 9 + vaesenc xmm8, [keys + 16*11] ; round 9 + vaesenc xmm8, [keys + 16*12] ; round 9 + vaesenc xmm8, [keys + 16*13] ; round 9 + vaesenclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + 
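The scalar shl/adc/cmovc/xor sequence interleaved with the AES rounds above is the XTS tweak update: it multiplies the 128-bit tweak by alpha (x) in GF(2^128), reduced by x^128 + x^7 + x^2 + x + 1, whose low byte 0x87 is the GHASH_POLY constant, and the _steal_cipher path is standard XTS ciphertext stealing for a trailing partial block. A rough, self-contained C sketch of both ideas is given below; it is not part of the patch, and the names (xts_mul_alpha, xts_steal_layout, n_tail) are illustrative and do not appear in the assembly.

    #include <stdint.h>
    #include <string.h>

    /* Multiply the 128-bit tweak by alpha in GF(2^128), reduced by
     * x^128 + x^7 + x^2 + x + 1; 0x87 is the low byte of the reduction
     * polynomial, matching GHASH_POLY.  tw[0] holds the low 64 bits
     * ([TW + 8*0]) and tw[1] the high 64 bits ([TW + 8*1]). */
    static void xts_mul_alpha(uint64_t tw[2])
    {
            uint64_t carry_lo = tw[0] >> 63;  /* carry out of the low half  (shl) */
            uint64_t carry_hi = tw[1] >> 63;  /* carry out of the high half (adc) */

            tw[1] = (tw[1] << 1) | carry_lo;               /* adc twtemph, twtemph */
            tw[0] = (tw[0] << 1) ^ (carry_hi ? 0x87 : 0);  /* cmovc + xor          */
    }

    /* Ciphertext stealing for a trailing partial block of n_tail bytes
     * (1..15): the partial output is the first n_tail bytes of the last
     * full ciphertext block, and the last full block is rebuilt from the
     * leftover plaintext padded with the stolen ciphertext bytes, then
     * encrypted once more with the next tweak value. */
    static void xts_steal_layout(uint8_t *ct_last_full, uint8_t *ct_partial,
                                 const uint8_t *pt_partial, size_t n_tail,
                                 uint8_t block[16])
    {
            memcpy(ct_partial, ct_last_full, n_tail);             /* stolen bytes       */
            memcpy(block, pt_partial, n_tail);                    /* leftover plaintext */
            memcpy(block + n_tail, ct_last_full + n_tail, 16 - n_tail);
            /* 'block' is then XTS-encrypted (after one more xts_mul_alpha) and
             * written back over the previous full ciphertext block. */
    }

Applied once per block (eight times per _main_loop iteration), this update keeps the tweak values stored at [TW] one step ahead of the blocks currently being encrypted, which is why the scalar work can be stitched between the vaesenc rounds.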
+_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + 
vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm new file mode 100644 index 000000000..6db85486d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm @@ -0,0 +1,1653 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak 
encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define 
%%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == 
%%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if 
(%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, 
[keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if 
(0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_avx, function +XTS_AES_256_enc_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 
6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, 
[TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + 
N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 9 + vaesenc xmm8, [keys + 16*11] ; round 9 + vaesenc xmm8, [keys + 16*12] ; round 9 + vaesenc xmm8, [keys + 16*13] ; round 9 + vaesenclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, 
xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 
0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm new file mode 100644 index 000000000..51cb31074 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm @@ -0,0 +1,1652 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, 
[%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*11] + aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*11] + movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*12] + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*12] + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*13] + aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*13] + movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*14] + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*14] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu 
%%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if 
(%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 
16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load 
next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph 
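+; Editor's note (illustrative, not part of the upstream source): the scalar fragments
+; interleaved across these rounds together compute the next tweak, T' = T * x in GF(2^128),
+; on the 64-bit halves twtempl:twtemph. One full update, collected in one place, would be:
+;     xor   ghash_poly_8b_temp, ghash_poly_8b_temp
+;     shl   twtempl, 1                          ; low half << 1, CF = old bit 63
+;     adc   twtemph, twtemph                    ; high half << 1 plus carry, CF = old bit 127
+;     cmovc ghash_poly_8b_temp, ghash_poly_8b   ; select 0x87 only if bit 127 was shifted out
+;     xor   twtempl, ghash_poly_8b_temp         ; reduce modulo x^128 + x^7 + x^2 + x + 1
+; The update is sliced across the aesenc rounds only to overlap scalar work with AES latency.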
+%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_sse, function +XTS_AES_256_enc_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + 
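+; Editor's note (illustrative): on win64, xmm6-xmm15 are callee-saved, so they are spilled
+; to the _xmm area of the frame reserved above. The frame laid out at rsp is
+; [TW: 8 tweak values][keys: 15 round keys][_xmm (win64 only)][_gpr]; VARIABLE_OFFSET is an
+; odd multiple of 8 so that rsp, which is 8 bytes off 16-byte alignment at function entry,
+; becomes 16-byte aligned for the aligned movdqa spills and tweak/key accesses below.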
movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp 
ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenc xmm8, [keys + 16*10] ; round 9 + aesenc xmm8, [keys + 16*11] ; round 9 + aesenc xmm8, [keys + 16*12] ; round 9 + aesenc xmm8, [keys + 16*13] ; round 9 + aesenclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, 
[_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done 
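+; Editor's note (illustrative): in these short-input paths the pointers were backed up by
+; 16*k above so that the last full block processed (copied into xmm8) sits at offset 112
+; from ptr_plaintext/ptr_ciphertext; _steal_cipher can then reuse the same
+; [ptr + 112 + N_val] addressing as the full 8-block path when N is not a multiple of 16.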
+ jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm new file mode 100644 index 000000000..37a5dc792 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm @@ -0,0 +1,1634 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_vaes( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + 
vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define 
%%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if 
(%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if 
(%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif 
+%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro 
+ + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_vaes, function 
+XTS_AES_256_enc_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + movdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + 
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted 
by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 9 + vaesenc xmm8, [keys + 16*11] ; round 9 + vaesenc xmm8, [keys + 16*12] ; round 9 + vaesenc xmm8, [keys + 16*13] ; round 9 + vaesenclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu 
[ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 
16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_enc_expanded_key_vaes +no_XTS_AES_256_enc_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm new file mode 100644 index 000000000..5b805b74d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm @@ -0,0 +1,1708 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_sse( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, 
%%xraw_key, 10101010b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 10 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%ptr_key2 %8 +%define %%ptr_key1 %9 +%define %%ptr_expanded_keys %10 + + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + movdqu %%xkey2_2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + movdqu %%xkey1_2, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + movdqa 
[%%ptr_expanded_keys+16*8], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 
generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, 
ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if 
(%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 
+ aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_sse, function +XTS_AES_256_enc_sse: + 
endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, 
xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, 
twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenc xmm8, [keys + 16*10] ; round 9 + aesenc xmm8, [keys + 16*11] ; round 9 + aesenc xmm8, [keys + 16*12] ; round 9 + aesenc xmm8, [keys + 16*13] ; round 9 + aesenclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; 
store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + 
movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm new file mode 100644 index 000000000..f75497ece --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm @@ -0,0 +1,1687 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor 
%%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 10 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%ptr_key2 %8 +%define %%ptr_key1 %9 +%define %%ptr_expanded_keys %10 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating 
round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if 
(%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if 
(%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 
+%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq 
%%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_vaes, function +XTS_AES_256_enc_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], 
xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + movdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, 
[ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweek values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweek values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block 
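+	; A minimal C sketch of one tweak doubling, assuming the standard XTS convention
+	; (little-endian 128-bit tweak, 0x87 feedback polynomial); the vpsrldq/vpclmulqdq/
+	; vpslldq sequences above apply eight such doublings at once, i.e. they multiply
+	; each tweak by x^8 in GF(2^128):
+	;
+	;   #include <stdint.h>
+	;   /* hypothetical helper: multiply a 16-byte little-endian tweak by x */
+	;   static void xts_double_tweak(uint8_t t[16])
+	;   {
+	;       uint8_t carry = t[15] >> 7;                  /* bit shifted out of the top */
+	;       for (int i = 15; i > 0; i--)
+	;           t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
+	;       t[0] = (uint8_t)(t[0] << 1);
+	;       if (carry)
+	;           t[0] ^= 0x87;                            /* GHASH_POLY reduction */
+	;   }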
+ jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 10 + vaesenc xmm8, [keys + 16*11] ; round 11 + vaesenc xmm8, [keys + 16*12] ; round 12 + vaesenc xmm8, [keys + 16*13] ; round 13 + vaesenclast xmm8, [keys + 16*14] ; round 14 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store 
ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / 
shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_enc_vaes +no_XTS_AES_256_enc_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S new file mode 100644 index 000000000..7214f0f25 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S @@ -0,0 +1,215 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "gcm_common_128.S" +/* + void gist_aes_gcm_enc_finalize_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *auth_tag, \ + uint64_t auth_tag_len \ + ) +*/ + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg auth_tag ,2 + declare_var_generic_reg auth_tag_len ,3 + declare_var_generic_reg partial_block_len ,4 + declare_var_generic_reg partial_block ,1 + + declare_var_generic_reg hashkey_addr ,0 + declare_var_generic_reg temp0, 6 + + declare_var_vector_reg OrigIV ,0 + declare_var_vector_reg AadHash ,1 + declare_var_vector_reg HashKey0 ,2 + declare_var_vector_reg HashKey0Ext ,3 + declare_var_vector_reg High ,4 + declare_var_vector_reg Low ,5 + declare_var_vector_reg Middle0 ,6 + declare_var_vector_reg Len ,7 + declare_var_vector_reg Tmp0 ,8 + declare_var_vector_reg Tmp1 ,9 + declare_var_vector_reg Zero ,10 + declare_var_vector_reg Poly ,11 + declare_var_vector_reg PartitialBlock ,13 + + declare_var_vector_reg Tmp2 ,31 + declare_var_vector_reg Tmp3 ,12 + + .set stack_size,48 + .macro push_stack + stp d8, d9,[sp,-stack_size]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + .endm + + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d8, d9, [sp], stack_size + .endm +START_FUNC(enc,KEY_LEN,_finalize_) +START_FUNC(dec,KEY_LEN,_finalize_) + ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF] + load_aes_keys key_data + push_stack + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */ + ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* Len , context move to partial block*/ + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + /* complete part */ + cbnz partial_block_len,10f + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + shl vLen.2d,vLen.2d,3 /* Len */ + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + aes_encrypt_round OrigIV,Key4 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key5 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key7 + eor vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key8 + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + aese vOrigIV.16b,vKey9.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vOrigIV.16b,vOrigIV.16b,vKey10.16b + rbit vAadHash.16b,vOrigIV.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret 
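+	/* A hedged usage sketch of the finalize entry point assembled here, assuming the
+	 * START_FUNC(enc,KEY_LEN,_finalize_) macro expands to the usual ISA-L symbol
+	 * aes_gcm_enc_128_finalize, and that key_data/ctx are the caller's already
+	 * initialised struct gcm_key_data / struct gcm_context_data (hypothetical names):
+	 *
+	 *   uint8_t tag[16];
+	 *   aes_gcm_enc_128_finalize(&key_data, &ctx, tag, sizeof(tag));  // tag len 16, 12 or 8
+	 */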
+ +10: /* cbnz partial_block_len,10f */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32] + aes_encrypt_round OrigIV,Key0 + read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key4 + rbit vAadHash.16b,vPartitialBlock.16b + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key5 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + eor vHigh.16b,vHigh.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key7 + eor vLow.16b,vLow.16b,vTmp1.16b + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key8 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aese vOrigIV.16b,vKey9.16b + pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + eor vOrigIV.16b,vOrigIV.16b,vKey10.16b + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + eor vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + rbit vAadHash.16b,vOrigIV.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret + +END_FUNC(enc,KEY_LEN,_finalize_) +END_FUNC(dec,KEY_LEN,_finalize_) + + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S new file mode 100644 index 000000000..9eda7178e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S @@ -0,0 +1,220 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "gcm_common_256.S" +/* + void gist_aes_gcm_enc_finalize_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *auth_tag, \ + uint64_t auth_tag_len \ + ) +*/ + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg auth_tag ,2 + declare_var_generic_reg auth_tag_len ,3 + declare_var_generic_reg partial_block_len ,4 + declare_var_generic_reg partial_block ,1 + + declare_var_generic_reg hashkey_addr ,0 + declare_var_generic_reg temp0 ,6 + + declare_var_vector_reg OrigIV ,0 + declare_var_vector_reg AadHash ,1 + declare_var_vector_reg HashKey0 ,2 + declare_var_vector_reg HashKey0Ext ,3 + declare_var_vector_reg High ,4 + declare_var_vector_reg Low ,5 + declare_var_vector_reg Middle0 ,6 + declare_var_vector_reg Len ,7 + declare_var_vector_reg Tmp0 ,8 + declare_var_vector_reg Tmp1 ,9 + declare_var_vector_reg Zero ,10 + declare_var_vector_reg Poly ,11 + declare_var_vector_reg PartitialBlock ,13 + + declare_var_vector_reg Tmp2 ,31 + declare_var_vector_reg Tmp3 ,12 + + .set stack_size,48 + .macro push_stack + stp d8, d9,[sp,-stack_size]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + .endm + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d8, d9, [sp], stack_size + .endm + +START_FUNC(enc,KEY_LEN,_finalize_) +START_FUNC(dec,KEY_LEN,_finalize_) + ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF] + load_aes_keys key_data + push_stack + + ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */ + ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* Len , context move to partial block*/ + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + /* complete part */ + cbnz partial_block_len,10f + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + shl vLen.2d,vLen.2d,3 /* Len */ + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + aes_encrypt_round OrigIV,Key4 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key5 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key7 + eor vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key8 + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key9 + aes_encrypt_round OrigIV,Key10 + aes_encrypt_round OrigIV,Key11 + aes_encrypt_round OrigIV,Key12 + aese vOrigIV.16b,vKey13.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vOrigIV.16b,vOrigIV.16b,vKey14.16b + rbit vAadHash.16b,vOrigIV.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret + +10: /* cbnz partial_block_len,10f */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32] + aes_encrypt_round OrigIV,Key0 + read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key4 + rbit vAadHash.16b,vPartitialBlock.16b + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key5 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + eor vHigh.16b,vHigh.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key7 + eor vLow.16b,vLow.16b,vTmp1.16b + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key8 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + pmull 
vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key9 + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + aes_encrypt_round OrigIV,Key10 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key11 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key12 + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + aese vOrigIV.16b,vKey13.16b + eor vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vOrigIV.16b,vOrigIV.16b,vKey14.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + rbit vAadHash.16b,vOrigIV.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret + +END_FUNC(enc,KEY_LEN,_finalize_) +END_FUNC(dec,KEY_LEN,_finalize_) + + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S new file mode 100644 index 000000000..0dd94c6b7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S @@ -0,0 +1,161 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "gcm_common.S" +/* +void gist_aes_gcm_init_##mode( + const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t *iv, + uint8_t const *aad, + uint64_t aad_len + ); +*/ + key_data .req x0 + context .req x1 + iv .req x2 + aad .req x3 + aad_len .req x4 + temp0 .req x7 + wtemp0 .req w7 + temp1 .req x6 + left_len .req x5 + aad_left .req x2 + small_tbl_adr .req x6 + + hashkey_base .req x0 + hashkey_addr .req x2 + + declare_var_vector_reg AadHash,0 + declare_var_vector_reg Dat0,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Middle0,5 + declare_var_vector_reg Low,6 + declare_var_vector_reg LeftDat,7 + declare_var_vector_reg Zero,16 + declare_var_vector_reg Poly,17 + + declare_var_vector_reg Tmp0,18 + declare_var_vector_reg Tmp1,19 + declare_var_vector_reg Ctr,1 + + +START_FUNC(init,128,_) +START_FUNC(init,192,_) +START_FUNC(init,256,_) + stp aad_len,xzr,[context,AAD_LEN_OFF] //save in_length and aad_length + str xzr,[context,PARTIAL_BLOCK_LENGTH_OFF] //clear partial_block_length + add hashkey_base,key_data,HASHKEY_BASE_OFF + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + /* Set orig_IV */ + ldr wtemp0,[iv,8] + ldr temp1,[iv] + movk temp0,0x100,lsl 48 + stp temp1,temp0,[context,ORIG_IV_OFF] + and left_len,aad_len,15 + ldp qHashKey0,qHashKey0Ext,[key_data,(HASHKEY_TOTAL_NUM-1)*32] + /* Set current_counter, save as cpu order */ + ldr qCtr,[context,ORIG_IV_OFF] + rev32 vCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + cbz aad_len,init_zero_exit + lsr aad_len,aad_len,4 + /* Read small data */ + cbz left_len,2f + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,small_tbl_adr,Tmp0 + cbz aad_len,24f // aad_len less than 16 +2: + cbnz left_len,1f + /*left_len == 0 && aad_len !=0 */ + + sub aad_len,aad_len,1 + /* leftDat = aad[-1] */ + ldr qLeftDat,[aad,aad_len,lsl 4] + cbz aad_len,24f /* aad_len == 16 */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 +1: + /* loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f /* break loop */ + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz aad_len,23f /* left aad_len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add hashkey_addr,hashkey_base,temp0,lsl 5 + sub aad_len,aad_len,1 + + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + str qAadHash,[context] + ret + +23: + ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + str qAadHash,[context] + ret +24: /* less or equal than 16 */ + rbit vLeftDat.16b, vLeftDat.16b + str qLeftDat,[context] + ret +init_zero_exit: + stp xzr,xzr,[context] + ret +END_FUNC(init,128,_) +END_FUNC(init,192,_) +END_FUNC(init,256,_) + diff --git 
a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S new file mode 100644 index 000000000..c4e8ef59c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S @@ -0,0 +1,140 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a + .section .rodata +#define CONST_VAR_START(a) \ + .align 3;.global a;.type a, %object;a + +#define CONST_VAR_END(a) \ + .size a,. 
- a +CONST_VAR_START(shift_small_data_table): + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +CONST_VAR_START(read_small_data_table): + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +CONST_VAR_END(shift_small_data_table) + .byte 0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff +CONST_VAR_START(write_small_data_table): + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff +CONST_VAR_END(read_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0x04,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0x0c,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff +CONST_VAR_START(read_end_small_data_table): + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff +CONST_VAR_END(write_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0e + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e + .byte 0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d +CONST_VAR_START(write_end_small_data_table): + .byte 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e +CONST_VAR_END(read_end_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff + .byte 0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff +CONST_VAR_START(tbx_end_small_data_table): + .byte 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff +CONST_VAR_END(write_end_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f 
+CONST_VAR_START(tbx_start_small_data_table): + .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f +CONST_VAR_END(tbx_end_small_data_table) + .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f +CONST_VAR_END(tbx_start_small_data_table) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S new file mode 100644 index 000000000..9f1ff80fb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_enc_dec.S" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S new file mode 100644 index 000000000..f3cc2b802 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_enc_dec.S" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S new file mode 100644 index 000000000..e635d7e70 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_precomp.S" \ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S new file mode 100644 index 000000000..52b76a6a2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_precomp.S" \ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S new file mode 100644 index 000000000..42c48d9a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S @@ -0,0 +1,32 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_update.S" + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S new file mode 100644 index 000000000..1c2c33b48 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S @@ -0,0 +1,32 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_update.S" + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c new file mode 100644 index 000000000..1a2077356 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c @@ -0,0 +1,108 @@ +/********************************************************************** + Copyright(c) 2020-2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES); +} + +#define DEFINE_CBC_INTERFACE_DISPATCHER(func,mode,suffix) \ + DEFINE_INTERFACE_DISPATCHER(aes_cbc_##func##_##mode) \ + { \ + if (is_crypto_available()) \ + return PROVIDER_INFO(aes_cbc_##func##_##mode##_##suffix); \ + return PROVIDER_BASIC(aes_cbc_##func##_##mode); \ + } + +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 128, aes); +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 192, aes); +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 256, aes); + +/* + * AES-CBC decryption can be parallelised according to algorithm. Decryption + * flow is to do decrypt and then EOR previous input data or IV(first). + * So, decryption can be parallelised and EOR all data as output data. + * + * The unroll factor depends on micro architecture. The factors of N1, A57 and A72 + * are based on optimization guide and test results. Other platforms are based on + * ThunderX2 test results. 
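 As a rough illustration of why the per-block work can be unrolled, a scalar reference sketch follows (the helper aes_decrypt_block() is a hypothetical placeholder, not an isa-l routine): every D_K(C[i]) is independent, and the chaining shows up only as a final XOR with the previous ciphertext block, or with the IV for the first block.

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar reference for AES-CBC decryption; aes_decrypt_block()
 * stands in for a single-block AES decrypt using a prepared key schedule. */
static void cbc_dec_reference(const uint8_t *in, const uint8_t *iv,
                              const void *key_sched, uint8_t *out, size_t blocks,
                              void (*aes_decrypt_block)(const void *ks,
                                                        const uint8_t in_blk[16],
                                                        uint8_t out_blk[16]))
{
        uint8_t prev[16], cblk[16], tmp[16];

        memcpy(prev, iv, 16);
        for (size_t i = 0; i < blocks; i++) {
                memcpy(cblk, in + 16 * i, 16);             /* keep C[i]; allows in-place use */
                aes_decrypt_block(key_sched, cblk, tmp);   /* independent for every block */
                for (int j = 0; j < 16; j++)
                        out[16 * i + j] = tmp[j] ^ prev[j]; /* chaining is only this XOR */
                memcpy(prev, cblk, 16);
        }
}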
+ * + */ +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_128) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_128_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_128_aes_4); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_128_aes_6); + } + return PROVIDER_INFO(aes_cbc_dec_128_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_128); +} + +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_192) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_192_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_192_aes_5); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_192_aes_4); + } + return PROVIDER_INFO(aes_cbc_dec_192_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_192); +} + +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_256) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_256_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_256_aes_5); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_256_aes_6); + } + return PROVIDER_INFO(aes_cbc_dec_256_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_256); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S new file mode 100644 index 000000000..6f793843a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#define FN_NAME(fn,mode,post) aes_cbc_##fn##_##mode##_##post +#define LABEL_NAME(fn,mode,post) .L##fn##_##mode##_##post +#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \ + .type FN_NAME(fn,mode,post), %function; \ + FN_NAME(fn,mode,post): +#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post) +.macro declare_var_vector_reg name:req,reg:req +.ifdef q\name + .unreq q\name + .unreq v\name + .unreq s\name + .unreq d\name +.endif + .set q\name , \reg + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg + d\name .req d\reg +.endm + +.macro declare_var_generic_reg name:req,reg:req + \name .req x\reg + x\name .req x\reg + w\name .req w\reg +.endm \ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S new file mode 100644 index 000000000..11bd90a71 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S @@ -0,0 +1,482 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
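The FN_NAME()/START_FUNC()/END_FUNC() wrappers defined above build every per-variant symbol by preprocessor token pasting. A minimal stand-alone illustration of that naming convention follows (the demo program itself is hypothetical; only the FN_NAME definition mirrors cbc_common.S):

#include <stdio.h>

#define FN_NAME(fn, mode, post) aes_cbc_##fn##_##mode##_##post
#define STR(x)  #x
#define XSTR(x) STR(x)

int main(void)
{
        /* FN_NAME(dec, 128, aes_4) pastes into aes_cbc_dec_128_aes_4, the same
         * symbol the per-microarchitecture dispatcher returns for Cortex-A57. */
        puts(XSTR(FN_NAME(dec, 128, aes_4)));
        return 0;
}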
+**********************************************************************/ + .arch armv8-a+crypto + .text +#include "cbc_common.S" + .altmacro +.macro _aes_decrypt_round block:req,key:req + aesd v\block\().16b,vKey\key\().16b + .if \key < 13 + aesimc v\block\().16b,v\block\().16b + .endif + .if \key > 13 + .error "erro her" + .endif +.endm + +.macro aes_decrypt_round block,reg,key + _aes_decrypt_round In\reg\()_\block,\key +.endm + +.macro load_keys first_key + .if \first_key == 4 + ld1 {vKey4.4s -vKey6.4s},[keys],3*16 + .endif + .ifc 2 , \first_key + ldr qKey2,[keys],1*16 + ld1 {vKey3.16b -vKey6.16b},[keys],4*16 + .endif + .ifc 0 , \first_key + ld1 {vKey0.16b -vKey2.16b},[keys],3*16 + ld1 {vKey3.16b -vKey6.16b},[keys],4*16 + .endif + ld1 {vKey7.16b -vKey10.16b},[keys],4*16 + ld1 {vKey11.16b-vKey14.16b},[keys],4*16 +.endm + +.macro aes_decrypt_blocks_round blocks,key_idx,key_reg,next_keyreg,first_idx + .if \key_idx == 12 + ldr q\next_keyreg,[keys],(\first_idx-13)*16 + .else + ldr q\next_keyreg,[keys],16 + .endif + n=0 + .rept \blocks + _aes_decrypt_round %n,\key_reg + n=n+1 + .endr +.endm + +.macro aes_decrypt_rounds blocks,key_st,key_end,first_idx + j=key_st + .rept \key_end - \key_st + 1 + aes_decrypt_blocks_round \blocks,%j,%(j%2),%((j+1)%2),\first_idx + j=j+1 + .endr +.endm + +.macro aes_cbc_decrypt_rounds blocks,first_idx,reg,next_reg + aes_decrypt_rounds \blocks,\first_idx,12,\first_idx +.endm + +.macro declare_prefix idx,reg,prefix + declare_var_vector_reg \prefix\()\idx,\reg +.endm + +.macro mldr reg,block,addr + ldr qIn\reg\()_\block,[\addr],16 +.endm + +.macro mldrin reg,blocks,addr + .if \blocks == 1 + ldr qIn\reg\()_0,[\addr],16 + .exitm + .endif + .if \blocks == 2 + ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16 + .exitm + .endif + .if \blocks == 3 + ldr qIn\reg\()_0,[\addr],16 + ldp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16 + .exitm + .endif + .if \blocks == 4 + ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 5 + ldr qIn\reg\()_0,[\addr],16 + ld1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 6 + ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16 + ld1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 7 + ld1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16 + ld1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16 + .exitm + .endif + + .if \blocks == 8 + ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 9 + ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16 + ldr qIn\reg\()_8,[\addr],16 + .exitm + .endif +.endm + +.macro mstrout reg,blocks,addr + .if \blocks == 1 + str qIn\reg\()_0,[\addr],16 + .exitm + .endif + .if \blocks == 2 + stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16 + .exitm + .endif + .if \blocks == 3 + str qIn\reg\()_0,[\addr],16 + stp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16 + .exitm + .endif + .if \blocks == 4 + st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 5 + str qIn\reg\()_0,[\addr],16 + st1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 6 + stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16 + st1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 7 + st1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16 + st1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16 + .exitm + .endif + + .if \blocks == 8 + st1 
{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16 + .exitm + .endif + .if \blocks == 9 + st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16 + st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16 + str qIn\reg\()_8,[\addr],16 + .exitm + .endif +.endm + +.macro eorkey14 block,reg + eor vBlock\block\().16b,vKey14.16b,vState\reg\()_\block\().16b +.endm + +.macro eorblock block,reg + eor vIn\reg\()_\block\().16b,vBlock\block\().16b,vIn\reg\()_\block\().16b +.endm + +.macro movstate0 block,reg + mov vState\reg\()_0.16b,vIn\reg\()_\block\().16b +.endm + +.macro cbc_decrypt_rounds blocks,reg,first_key,cur_blocks + .ifb \cur_blocks + _blocks=\blocks + .else + _blocks=\cur_blocks + .endif + key=\first_key + 1 + .if 3*\blocks+1 >= 32-15+\first_key + ldr_key %key,\first_key + .endif + n=0 + .rept _blocks - 1 + eorkey14 %((n+1)%_blocks),\reg + aes_decrypt_round %n,\reg,\first_key + n=n+1 + .endr + eorkey14 0,\reg + movstate0 %(_blocks-1),\reg + aes_decrypt_round %n,\reg,\first_key + + k=0 + .rept 15-\first_key-3 + n=0 + .if 3*\blocks+1 >= 32-15+\first_key + ldr_key %(key+k+1),\first_key + .endif + + .rept _blocks + aes_decrypt_round %n,\reg,%(key+k) + n=n+1 + .endr + k=k+1 + .endr + n=0 + .if 3*\blocks+1 >= 32-15+\first_key + ldr_key \first_key,\first_key + .endif + .rept _blocks + aes_decrypt_round %n,\reg,13 + eorblock %n,\reg + n=n+1 + .endr +.endm + +.macro print_macro a,b,c,d,e + .print "print_macro,\a \b \c \d \e" +.endm + +.macro remainder_process blocks,first_key,curblk +.if \blocks > (1<<\curblk) + tbz xlen_remainder,\curblk,1f + mldrin 0,%(1<<\curblk),in + cbc_decrypt_rounds \blocks,0,\first_key,%(1<<\curblk) + mstrout 0,%(1<<\curblk),out +1: +.endif +.endm + +.macro aes_cbc_decrypt_blocks first_key,blocks + division \blocks, len_bytes,len_remainder,tmp0,tmp1 + mov xlen_quotient_in,xlen_quotient + /* + input regs(2*\block) + tmp regs(\blocks) + State reg(1) + + key regs(15-\first_key) < 32 + */ + .if 3*\blocks+1 < 32-15+\first_key + n=\first_key + .rept 15-\first_key + declare_prefix %n,%(n+17),Key + n=n+1 + .endr + load_keys \first_key + .else + n=\first_key + .rept 14-\first_key + declare_prefix %n,%((n%2)+29),Key + n=n+1 + .endr + declare_prefix 14,31,Key + /* load first key */ + ldr_key \first_key,\first_key + /* load last key */ + ldr_key 14,\first_key + .endif + m=\blocks + l=\blocks-1 + declare_prefix 0,0,State0_ + declare_prefix 0,0,State1_ + n=0 + .rept \blocks + declare_prefix %n,%(n+1),In0_ + declare_prefix %n,%(n+m+1),In1_ + declare_prefix %n,%(n+2*m+1),Block + n=n+1 + .endr + n=1 + .rept \blocks -1 + declare_prefix %n,%(n),State0_ + declare_prefix %n,%(n+m),State1_ + n=n+1 + .endr + ldr qState0_0,[IV] + cbz xlen_quotient,9f + mldrin 0,\blocks,in + sub xlen_quotient_in,xlen_quotient_in,1 + b 5f + +3: + sub xlen_quotient,xlen_quotient,1 + mstrout 1,\blocks,out + cbz xlen_quotient,9f +5: + cbz xlen_quotient_in,1f + mldrin 1,\blocks,in + sub xlen_quotient_in,xlen_quotient_in,1 +1: + cbc_decrypt_rounds \blocks,0,\first_key + sub xlen_quotient,xlen_quotient,1 + mstrout 0,\blocks,out + cbz xlen_quotient,9f + + cbz xlen_quotient_in,1f + mldrin 0,\blocks,in + sub xlen_quotient_in,xlen_quotient_in,1 +1: + cbc_decrypt_rounds \blocks,1,\first_key + b 3b +9: + remainder_process \blocks,\first_key,3 + remainder_process \blocks,\first_key,2 + remainder_process \blocks,\first_key,1 + remainder_process \blocks,\first_key,0 +.endm + + +.macro division blocks,quotient,remainder,tmp0,tmp1 + .if \blocks == 1 + mov x\remainder, 0 + .exitm + 
.endif + .if \blocks == 2 + and x\remainder, x\quotient, 1 + lsr x\quotient, x\quotient, 1 + .exitm + .endif + .if \blocks == 3 + mov x\tmp0, -6148914691236517206 + mov x\remainder, x\quotient + movk x\tmp0, 0xaaab, lsl 0 + umulh x\tmp0, x\quotient, x\tmp0 + and x\tmp1, x\tmp0, -2 + lsr x\quotient, x\tmp0, 1 + add x\tmp1, x\tmp1, x\quotient + sub x\remainder, x\remainder, x\tmp1 + .exitm + .endif + .if \blocks == 4 + and x\remainder, x\quotient, 3 + lsr x\quotient, x\quotient, 2 + .exitm + .endif + .if \blocks == 5 + mov x\tmp0, -3689348814741910324 + mov x\remainder, x\quotient + movk x\tmp0, 0xcccd, lsl 0 + umulh x\tmp0, x\quotient, x\tmp0 + and x\tmp1, x\tmp0, -4 + lsr x\quotient, x\tmp0, 2 + add x\tmp1, x\tmp1, x\quotient + sub x\remainder, x\remainder, x\tmp1 + .exitm + .endif + .if \blocks == 6 + mov x\tmp0, -6148914691236517206 + mov x\tmp1, x\quotient + movk x\tmp0, 0xaaab, lsl 0 + umulh x\tmp0, x\quotient, x\tmp0 + lsr x\quotient, x\tmp0, 2 + add x\remainder, x\quotient, x\quotient, lsl 1 + sub x\remainder, x\tmp1, x\remainder, lsl 1 + .exitm + .endif + .if \blocks == 7 + mov x\tmp0, 9363 + mov x\tmp1, x\quotient + movk x\tmp0, 0x9249, lsl 16 + movk x\tmp0, 0x4924, lsl 32 + movk x\tmp0, 0x2492, lsl 48 + umulh x\quotient, x\quotient, x\tmp0 + sub x\tmp0, x\tmp1, x\quotient + add x\tmp0, x\quotient, x\tmp0, lsr 1 + lsr x\quotient, x\tmp0, 2 + lsl x\remainder, x\quotient, 3 + sub x\remainder, x\remainder, x\quotient + sub x\remainder, x\tmp1, x\remainder + .exitm + .endif + .if \blocks == 8 + and x\remainder, x\quotient, 7 + lsr x\quotient, x\quotient, 3 + .exitm + .endif + .if \blocks == 9 + mov x\tmp0, 58255 + mov x\remainder, x\quotient + movk x\tmp0, 0x8e38, lsl 16 + movk x\tmp0, 0x38e3, lsl 32 + movk x\tmp0, 0xe38e, lsl 48 + umulh x\tmp0, x\quotient, x\tmp0 + and x\tmp1, x\tmp0, -8 + lsr x\quotient, x\tmp0, 3 + add x\tmp1, x\tmp1, x\quotient + sub x\remainder, x\remainder, x\tmp1 + .exitm + .endif +.endm + +.macro ldr_key num,first_key + ldr qKey\num,[keys,16*(\num - \first_key)] +.endm +#ifndef CBC_DECRYPT_BLOCKS_NUM +#define CBC_DECRYPT_BLOCKS_NUM 8 +#endif + +.macro cbc_decrypt first_key:req,blocks + lsr xlen_bytes,xlen_bytes,4 + cbz xlen_bytes,10f + push_stack + aes_cbc_decrypt_blocks \first_key,\blocks + pop_stack +10: +.endm + +.set stack_size,64 +.macro push_stack + stp d8, d9,[sp,-stack_size]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] +.endm + +.macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], stack_size +.endm + +/* +void aes_cbc_dec_128( + void *in, //!< Input cipher text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data + void *out, //!< Output plain text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); +*/ + declare_var_generic_reg in ,0 + declare_var_generic_reg IV ,1 + declare_var_generic_reg keys ,2 + declare_var_generic_reg out ,3 + declare_var_generic_reg len_bytes ,4 + declare_var_generic_reg len_quotient,4 + declare_var_generic_reg len_remainder,5 + declare_var_generic_reg tmp0 ,6 + declare_var_generic_reg tmp1 ,7 + declare_var_generic_reg len_quotient_in,6 + +.macro define_aes_cbc_dec_func mode:req,blocks:req + .global aes_cbc_dec_\mode\()_aes_\blocks +aes_cbc_dec_\mode\()_aes_\blocks: + cbc_decrypt %((256-mode)/32),\blocks + ret + .size aes_cbc_dec_\mode\()_aes_\blocks, . 
- aes_cbc_dec_\mode\()_aes_\blocks +.endm + +.irp blocks,1,2,3,4,5,6,7,8,9 + define_aes_cbc_dec_func 128,\blocks + define_aes_cbc_dec_func 192,\blocks + define_aes_cbc_dec_func 256,\blocks +.endr diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S new file mode 100644 index 000000000..8eb5e507d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S @@ -0,0 +1,157 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "cbc_common.S" + + declare_var_vector_reg Key0 ,17 + declare_var_vector_reg Key1 ,18 + declare_var_vector_reg Key2 ,19 + declare_var_vector_reg Key3 ,20 + declare_var_vector_reg Key4 ,21 + declare_var_vector_reg Key5 ,22 + declare_var_vector_reg Key6 ,23 + declare_var_vector_reg Key7 ,24 + declare_var_vector_reg Key8 ,25 + declare_var_vector_reg Key9 ,26 + declare_var_vector_reg Key10 ,27 + declare_var_vector_reg Key11 ,28 + declare_var_vector_reg Key12 ,29 + declare_var_vector_reg Key13 ,30 + declare_var_vector_reg Key14 ,31 + +.macro aes_encrypt_round block,key + aese v\block\().16b,vKey\key\().16b + .if \key < 13 + aesmc v\block\().16b,v\block\().16b + .endif +.endm + +.macro aes_encrypt_round_name block,key + aese v\block\().16b,v\key\().16b + aesmc v\block\().16b,v\block\().16b +.endm + + + +.set stack_size,64 +.macro push_stack + stp d8, d9,[sp,-stack_size]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] +.endm + +.macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], stack_size +.endm +/* +void aes_cbc_dec_128( + void *in, //!< Input cipher text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data + void *out, //!< Output plain text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); +*/ + declare_var_generic_reg in ,0 + declare_var_generic_reg IV ,1 + declare_var_generic_reg keys ,2 + declare_var_generic_reg out ,3 + declare_var_generic_reg len_bytes ,4 + + declare_var_vector_reg State ,0 + declare_var_vector_reg FirstKey ,1 + declare_var_vector_reg Block ,2 + declare_var_vector_reg ConstKey ,3 +.macro load_key num + ldr qKey\num,[keys],16 +.endm +.altmacro +.macro cbc_encrypt first:req + lsr xlen_bytes,xlen_bytes,4 + cbz xlen_bytes,3f + ldr qState,[IV] + ldr qKey\first,[keys],16 + .set lastkey_off,13-\first + ldr qKey14,[keys,lastkey_off*16] + ldr qBlock,[in],16 + n=\first + second=1+\first + .rept 5-n + n=n+1 + load_key %n + .endr + ld1 {vKey6.4s - vKey9.4s},[keys],4*16 + eor vBlock.16b,vBlock.16b ,vState.16b + eor vConstKey.16b,vKey\first\().16b,vKey14.16b + aes_encrypt_round Block,\first + ld1 {vKey10.4s - vKey13.4s},[keys] + b 1f +2: + aes_encrypt_round Block,\first + str qState,[out],16 +1: + sub xlen_bytes,xlen_bytes,1 + aes_encrypt_round Block,%second + cbz xlen_bytes,1f + ldr qKey\first,[in],16 +1: + n=second + .rept 12-n + n=n+1 + aes_encrypt_round Block,%n + .endr + + eor vKey\first\().16b,vKey\first\().16b,vConstKey.16b + aes_encrypt_round Block,13 + eor vState.16b,vBlock.16b,vKey14.16b + cbnz xlen_bytes,2b + str qState,[out] +3: + +.endm +START_FUNC(enc,128,aes) + cbc_encrypt 4 + ret +END_FUNC(enc,128,aes) + +START_FUNC(enc,192,aes) + cbc_encrypt 2 + ret +END_FUNC(enc,192,aes) + +START_FUNC(enc,256,aes) + cbc_encrypt 0 + ret +END_FUNC(enc,256,aes) \ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S new file mode 100644 index 000000000..fba533754 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S @@ -0,0 +1,38 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface aes_cbc_dec_128 +mbin_interface aes_cbc_dec_192 +mbin_interface aes_cbc_dec_256 + +mbin_interface aes_cbc_enc_128 +mbin_interface aes_cbc_enc_192 +mbin_interface aes_cbc_enc_256 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c new file mode 100644 index 000000000..f8188e3ae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c @@ -0,0 +1,255 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL)) == + (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL); +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_precomp_128_aes); + + return PROVIDER_BASIC(aes_gcm_precomp_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_precomp_256_aes); + + return PROVIDER_BASIC(aes_gcm_precomp_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_update_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_update_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_update_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_update_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_init_256_aes); + + return PROVIDER_BASIC(aes_gcm_init_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_init_128_aes); + + return PROVIDER_BASIC(aes_gcm_init_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_nt); + +} + 
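Every dispatcher in this file keys off the same HWCAP probe. A stand-alone sketch of that check is shown below, assuming a Linux/AArch64 toolchain (have_aes_pmull() and the printed strings are illustrative, not part of the dispatcher interface; the HWCAP_* fallback defines only exist so the sketch also compiles where the system headers do not provide them):

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_ASIMD
#define HWCAP_ASIMD (1UL << 1)
#endif
#ifndef HWCAP_AES
#define HWCAP_AES   (1UL << 3)
#endif
#ifndef HWCAP_PMULL
#define HWCAP_PMULL (1UL << 4)
#endif

static int have_aes_pmull(void)
{
        unsigned long hwcap = getauxval(AT_HWCAP);
        unsigned long want = HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL;

        return (hwcap & want) == want;
}

int main(void)
{
        /* A real dispatcher returns a function pointer; this sketch only
         * reports which implementation family would be selected. */
        puts(have_aes_pmull() ? "crypto extensions: *_aes variants"
                              : "no crypto extensions: generic fallback");
        return 0;
}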
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_update_nt); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S new file mode 100644 index 000000000..042f6cf19 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S @@ -0,0 +1,430 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
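The *_OFF constants defined just below index into the caller-supplied gcm_context_data block. The sketch that follows reconstructs the layout those offsets imply (the struct and its field names are inferred from the offsets for illustration, not copied from the isa-l headers) and verifies it at compile time when built as C11 or later:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct gcm_context_layout {
        uint8_t  aad_hash[16];              /* offset  0: running GHASH value       */
        uint64_t aad_length;                /* offset 16: AAD_LEN_OFF               */
        uint64_t in_length;                 /* offset 24: IN_LENGTH_OFF             */
        uint8_t  partial_block_enc_key[16]; /* offset 32: PARTIAL_BLOCK_ENC_KEY_OFF */
        uint8_t  orig_IV[16];               /* offset 48: ORIG_IV_OFF               */
        uint8_t  current_counter[16];       /* offset 64: CTR_OFF                   */
        uint64_t partial_block_length;      /* offset 80: PARTIAL_BLOCK_LENGTH_OFF  */
};

static_assert(offsetof(struct gcm_context_layout, aad_length) == 16, "AAD_LEN_OFF");
static_assert(offsetof(struct gcm_context_layout, in_length) == 24, "IN_LENGTH_OFF");
static_assert(offsetof(struct gcm_context_layout, partial_block_enc_key) == 32,
              "PARTIAL_BLOCK_ENC_KEY_OFF");
static_assert(offsetof(struct gcm_context_layout, orig_IV) == 48, "ORIG_IV_OFF");
static_assert(offsetof(struct gcm_context_layout, current_counter) == 64, "CTR_OFF");
static_assert(offsetof(struct gcm_context_layout, partial_block_length) == 80,
              "PARTIAL_BLOCK_LENGTH_OFF");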
+**********************************************************************/ + .arch armv8-a+crypto + .text +#define HASHKEY_TOTAL_NUM (24) +#define HASHKEY_BASE_OFF (15*16) +#define HASHKEY_OFF(n) ((15*16)+n*32) +#define HASHKEY_EXT_OFF(n) ((15*16)+n*32+16) +#ifndef KEY_LEN +#define KEY_LEN 128 +#endif +#ifndef BLOCKS +#define BLOCKS 24 +#endif +#define FN_NAME(fn,mode,post) aes_gcm_##fn##_##mode####post##aes +#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \ + .type FN_NAME(fn,mode,post), %function; \ + FN_NAME(fn,mode,post): +#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post) + +#define AAD_LEN_OFF 16 +#define IN_LENGTH_OFF 24 +#define PARTIAL_BLOCK_ENC_KEY_OFF 32 +#define PARTIAL_BLOCK_LENGTH_OFF 80 +#define CTR_OFF 64 +#define ORIG_IV_OFF 48 +/* + [low,middle,tmp0,high] +=dat0 * [hashkey0,hashkey0_ext] + ifnb dat1 + dat1=rbit(*dat_adr) + [hashkey0,hashkey0_ext] = *hashkey_adr + dat_adr+=16 + hashkey_adr+=32 +*/ + +.macro ghash_mult_round aadhash:req,dat_adr:req,hashkey_adr:req, \ + hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \ + tmp0:req,tmp1:req,next_dat:req,left_count:req + + ldr q\next_dat,[\dat_adr],16 + pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + .if \left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr,16] + .endif + eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d + eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d + .if \left_count > 1 + ldr q\hashkey0,[\hashkey_adr],32 + .endif + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + rbit v\aadhash\().16b, v\next_dat\().16b +.endm + +.macro ghash_mult_init_round aadhash:req,dat_adr:req,hashkey_adr:req, \ + hashkey0:req,hashkey0_ext:req, \ + high:req,low:req,middle:req,tmp0:req,next_dat:req,left_count:req + ldp q\hashkey0,q\hashkey0_ext,[\hashkey_adr],32 + ldr q\next_dat,[\dat_adr],16 + pmull v\middle\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + .if \left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr,16] + .endif + pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d + eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b + + pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d + .if \left_count > 1 + ldr q\hashkey0,[\hashkey_adr],32 + .endif + rbit v\aadhash\().16b, v\next_dat\().16b +.endm + +/* aadhash=reduction(low,middle,high)+dat0 */ +.macro ghash_mult_final_round aadhash:req, \ + high:req,low:req,middle:req,tmp0:req, \ + zero:req,poly:req + + ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/ + ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low */ + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor v\low\().16b,v\low\().16b,v\middle\().16b + + pmull2 v\middle\().1q,v\high\().2d,v\poly\().2d + + ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/ + ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low*/ + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor v\low\().16b,v\low\().16b,v\middle\().16b + pmull v\middle\().1q,v\high\().1d,v\poly\().1d + eor v\tmp0\().16b, v\low\().16b, v\middle\().16b + eor v\aadhash\().16b, v\aadhash\().16b, v\tmp0\().16b +.endm +.macro ghash_reset_hashkey_addr hashkey_addr:req,hashkey_base:req,count:req + add \hashkey_addr,\hashkey_base,(24-\count)<<5 +.endm + + +.macro ghash_block_n count:req,aadhash:req, dat:req,dat_addr:req, 
hashkey_addr:req, hashkey_base:req, \ + hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \ + tmp0:req,tmp1:req + + ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count + ghash_mult_init_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \ + \high,\low,\middle,\tmp0,\dat,\count + .set left_count,\count - 1 + .rept left_count + ghash_mult_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \ + \high,\low,\middle,\tmp0,\tmp1,\dat, left_count + .set left_count,left_count - 1 + + .endr + ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly +.endm + +/* + aadhash=aadhash*[hashkey,hashkey_ext] + rbit(dat) +*/ +.macro ghash_block_reg aadhash:req, dat:req, \ + hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \ + tmp0:req + pmull v\middle\().1q,v\aadhash\().1d,v\hashkey_ext\().1d + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey_ext\().2d + pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d + eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b + pmull v\low\().1q,v\aadhash\().1d,v\hashkey\().1d + rbit v\aadhash\().16b, v\dat\().16b + ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly +.endm + +.macro ghash_mult_round_noload aadhash:req, \ + hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \ + tmp0:req,tmp1:req + + pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d + eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + +.endm + +/* aadhash=reduction([low,high],poly)+dat0 */ +.macro poly_mult_final_x2 aadhash:req, \ + high:req,low:req,tmp0:req,tmp1:req, \ + poly:req + pmull2 v\tmp1\().1q,v\high\().2d,v\poly\().2d + eor v\low\().16b, v\aadhash\().16b, v\low\().16b + eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b + ext v\tmp0\().16b,v\tmp1\().16b,v\aadhash\().16b,8 //high + ext v\tmp1\().16b,v\aadhash\().16b,v\tmp1\().16b,8 //low + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + pmull v\tmp1\().1q,v\high\().1d,v\poly\().1d + eor v\aadhash\().16b, v\low\().16b, v\tmp1\().16b +.endm + +.macro aes_encrypt_round block,key + aese v\block\().16b,v\key\().16b + aesmc v\block\().16b,v\block\().16b +.endm + +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg + d\name .req d\reg +.endm + +.macro declare_var_generic_reg name:req,reg:req + \name .req x\reg + x\name .req x\reg + w\name .req w\reg +.endm + +/*Read data less than 16 */ +.macro read_small_data dest:req,src:req,size:req,tbl_adr:req,tbl:req + ldr q\tbl,[\tbl_adr,\size,lsl 4] + tbz \size,3,1f + ld1 {v\dest\().d}[0],[\src],8 +1: + tbz \size,2,1f + ld1 {v\dest\().s}[2],[\src],4 +1: + tbz \size,1,1f + ld1 {v\dest\().h}[6],[\src],2 +1: + tbz \size,0,1f + ld1 {v\dest\().b}[14],[\src],1 +1: + tbl v\dest\().16b,{v\dest\().16b},v\tbl\().16b +.endm +.macro read_small_data_start dest:req,src:req,size:req,tbl_adr:req,tbl:req + adrp \tbl_adr,:got:read_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:read_small_data_table] + read_small_data \dest,\src,\size,\tbl_adr,\tbl +.endm + +.macro read_small_data_end dest:req,src:req,size:req,tbl_adr:req,tbl:req + adrp \tbl_adr,:got:read_end_small_data_table + ldr 
\tbl_adr,[\tbl_adr,#:got_lo12:read_end_small_data_table] + read_small_data \dest,\src,\size,\tbl_adr,\tbl +.endm + +.macro write_small_data src:req,dest:req,size:req,tbl_adr:req,tmp1:req + ldr q\tmp1,[\tbl_adr,\size,lsl 4] + tbl v\tmp1\().16b,{v\src\().16b},v\tmp1\().16b + tbz \size,3,1f + st1 {v\tmp1\().d}[0],[\dest],8 +1: + tbz \size,2,1f + st1 {v\tmp1\().s}[2],[\dest],4 +1: + tbz \size,1,1f + st1 {v\tmp1\().h}[6],[\dest],2 +1: + tbz \size,0,1f + st1 {v\tmp1\().b}[14],[\dest],1 +1: +.endm +.macro write_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req + adrp \tbl_adr,:got:write_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_small_data_table] + write_small_data \src,\dest,\size,\tbl_adr,\tmp1 +.endm +.macro write_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req + adrp \tbl_adr,:got:write_end_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_end_small_data_table] + write_small_data \src,\dest,\size,\tbl_adr,\tmp1 +.endm + +.macro tbx_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req + adrp \tbl_adr,:got:tbx_end_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_end_small_data_table] + ldr q\tmp1,[\tbl_adr,\size,lsl 4] + tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b +.endm + +.macro tbx_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req + adrp \tbl_adr,:got:tbx_start_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_start_small_data_table] + ldr q\tmp1,[\tbl_adr,\size,lsl 4] + tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b +.endm + + +.macro clear_small_data dest:req,zero:req,size:req,tbl_adr:req,tmp1:req + adrp \tbl_adr,:got:shift_small_data_table + ldr \tbl_adr,[\tbl_adr,#:got_lo12:shift_small_data_table] + add \tbl_adr,\tbl_adr,16 + sub \tbl_adr,\tbl_adr,\size + ldr q\tmp1,[\tbl_adr] + tbx v\dest\().16b,{v\zero\().16b},v\tmp1\().16b +.endm + + +.macro aes_gcm_n_round is_enc:req,count:req,aadhash:req, dat_addr:req, \ + hashkey_addr:req, hashkey_base:req, \ + hashkey:req,hashkey_ext:req,high:req,low:req, poly:req, \ + ctr:req,enc_ctr:req,one:req,out_adr:req, \ + tmp0:req,tmp1:req + + ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count + + aes_gcm_init \is_enc,\aadhash,\dat_addr,\hashkey_addr, \ + \hashkey,\hashkey_ext, \high,\low, \ + \ctr,\enc_ctr,\one,\out_adr, \ + \tmp0,\tmp1,\count + + .set left_count,\count - 1 + .rept left_count + aes_gcm_middle \is_enc,\aadhash,\dat_addr,\hashkey_addr, \ + \hashkey,\hashkey_ext, \high,\low, \ + \ctr,\enc_ctr,\one,\out_adr, \ + \tmp0,\tmp1, left_count + .set left_count,left_count - 1 + .endr + + poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\poly + +.endm + + +/* + aadhash=aadhash*[hashkey_base[(TOTAL_HASHKEY_NUM-2),(TOTAL_HASHKEY_NUM-1)]] + rbit(dat) +*/ +.macro ghash_block_reg_x2 aadhash:req, dat:req, hashkey_base:req, \ + hashkey:req,high:req,low:req,tmp0:req, tmp1:req, \ + tmp2:req,temp0:req + ldr q\hashkey,[\hashkey_base,(TOTAL_HASHKEY_NUM-1)*32+16] + eor v\tmp2\().16b,v\tmp2\().16b,v\tmp2\().16b,8 //zero + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey\().1d + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey\().2d + ldr q\hashkey,[\hashkey_base,(TOTAL_HASHKEY_NUM-1)*32] + eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b + ext v\tmp0\().16b,v\tmp0\().16b,v\tmp2\().16b,8 /*high*/ + ext v\tmp1\().16b,v\tmp2\().16b,v\tmp0\().16b,8 /*low*/ + pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d + mov temp0,0x87 + pmull v\low\().1q,v\aadhash\().1d,v\hashkey\().1d + dup v\tmp2\().2d,x0 + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + eor 
v\low\().16b,v\low\().16b,v\tmp1\().16b + rbit v\aadhash\().16b, v\dat\().16b + poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\tmp2 +.endm + +.macro __generic_load_small_data is_enc:req,len_bit:req,small_read_len:req, \ + in_adr:req,out_adr:req,partial_block:req,temp0:req,temp1:req,r:req,p + tbz \small_read_len,\len_bit,1f + ldr\p \r\()\temp0,[\in_adr],1<<\len_bit /*in */ + ldr\p \r\()\temp1,[\partial_block] /* partial*/ + eor \r\()\temp1,\r\()\temp0,\r\()\temp1 + .ifc \is_enc ,decrypt + str\p \r\()\temp0,[\partial_block],1<<\len_bit + .endif + .ifc \is_enc, encrypt + str\p \r\()\temp1,[\partial_block],1<<\len_bit + .endif + str\p \r\()\temp1,[\out_adr],1<<\len_bit +1: +.endm +.macro generic_load_partial_block is_enc:req,small_read_len:req,in_adr:req,out_adr:req, \ + partial_block:req,temp0:req,temp1:req + __generic_load_small_data \is_enc,3,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,x /* small_read_len >=8 */ + __generic_load_small_data \is_enc,2,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w /* small_read_len >=4 */ + __generic_load_small_data \is_enc,1,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,h /* small_read_len >=2 */ + __generic_load_small_data \is_enc,0,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,b /* small_read_len >=1 */ +.endm +/* without Neon read version */ +.macro generic_partial_block_start is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \ + partial_block:req,partial_block_len:req,small_read_len:req,left_partial_block_len:req, \ + temp0:req + mov \left_partial_block_len,16 + add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF + sub \left_partial_block_len,\left_partial_block_len,\partial_block_len + add \partial_block,\partial_block,\partial_block_len + cmp \in_len,\left_partial_block_len + csel \small_read_len,\in_len,\left_partial_block_len, ls + add \partial_block_len,\partial_block_len,\small_read_len + sub \in_len,\in_len,\small_read_len + and \partial_block_len,\partial_block_len,0xf + str \partial_block_len,[\context,PARTIAL_BLOCK_LENGTH_OFF] + generic_load_partial_block \is_enc,\small_read_len,\in_adr,\out_adr,\partial_block, \ + \left_partial_block_len,\temp0 /* small_read_len >=8 */ +.endm +.macro generic_paritial_block_end is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \ + partial_block:req,temp0:req,temp1:req + str \in_len,[\context,PARTIAL_BLOCK_LENGTH_OFF] + add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF + generic_load_partial_block \is_enc,\in_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1 /* small_read_len >=8 */ +.endm +/*partial_block_len+in_len < 16,partial_block_len=0,in_len>0 */ +.macro paritial_block_small_length is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req,temp0:req,temp1:req,Ctr:req + + cbz 1f + ldr \temp0,[\context,PARTIAL_BLOCK_LENGTH_OFF] + add \temp1,\temp0,\in_len + str \temp1,[\context,PARTIAL_BLOCK_LENGTH_OFF] + add \context,\temp0,PARTIAL_BLOCK_ENC_KEY_OFF +2:/* loop start */ + sub \in_len,\in_len,1 + ldrb w\temp0,[\in_adr],1 + ldrb w\temp1,[\context] + eor w\temp1,w\temp1,w\temp0 + strb w\temp1,[\out_adr],1 +.ifc \is_enc , encrypt + strb w\temp1,[\context],1 +.endif +.ifc \is_enc,decrypt + strb w\temp0,[\context],1 +.endif + cbnz \in_len,2b +1:/* loop end */ +.endm + +/* 0 1 + ldr q\hashkey0,[\hashkey_adr],16 + .endif + + add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr + + rev32 v\enc_ctr\().16b,v\ctr\().16b + aes_encrypt_round \enc_ctr,Key0 + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + pmull 
v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + .if \left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr],16 + .endif + eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b + aes_encrypt_round \enc_ctr,Key1 + aes_encrypt_round \enc_ctr,Key2 + eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key3 + ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 + ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 + aes_encrypt_round \enc_ctr,Key4 + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key5 + ldr q\aadhash,[\dat_adr],16 + aes_encrypt_round \enc_ctr,Key6 + aes_encrypt_round \enc_ctr,Key7 + aes_encrypt_round \enc_ctr,Key8 + aese v\enc_ctr\().16b,vKey9.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b + .ifc \is_enc, encrypt + rbit v\aadhash\().16b,v\enc_ctr\().16b + .endif + .ifc \is_enc , decrypt + rbit v\aadhash\().16b,v\aadhash\().16b + .endif + str q\enc_ctr,[\out_adr],16 +.endm + +.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \ + hashkey0:req,hashkey0_ext:req, high:req,low:req, \ + ctr:req,enc_ctr:req,one:req,out_adr:req, \ + tmp0:req,tmp1:req,left_count:req + ldr q\hashkey0,[\hashkey_adr],16 + add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr + rev32 v\enc_ctr\().16b,v\ctr\().16b + aes_encrypt_round \enc_ctr,Key0 + ldr q\hashkey0_ext,[\hashkey_adr],16 + aes_encrypt_round \enc_ctr,Key1 + pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d + pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d + + .if \left_count > 1 + ldr q\hashkey0,[\hashkey_adr],16 + .endif + aes_encrypt_round \enc_ctr,Key2 + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b + + .if \left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr],16 + .endif + aes_encrypt_round \enc_ctr,Key3 + eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b + + aes_encrypt_round \enc_ctr,Key4 + ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 //low + ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 //high + aes_encrypt_round \enc_ctr,Key5 + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key6 + ldr q\aadhash,[\dat_adr],16 + aes_encrypt_round \enc_ctr,Key7 + aes_encrypt_round \enc_ctr,Key8 + aese v\enc_ctr\().16b,vKey9.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b + .ifc \is_enc , encrypt + rbit v\aadhash\().16b,v\enc_ctr\().16b + .endif + .ifc \is_enc , decrypt + rbit v\aadhash\().16b,v\aadhash\().16b + .endif + str q\enc_ctr,[\out_adr],16 +.endm + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S new file mode 100644 index 000000000..fb6a6e94d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S @@ -0,0 +1,181 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define KEY_LEN 256 +#include "gcm_common.S" + declare_var_vector_reg Key11,27 + declare_var_vector_reg Key12,28 + declare_var_vector_reg Key13,29 + declare_var_vector_reg Key14,30 +#define KEY_REGS 0,1,2,3,4,5,6,7,8,9,10,11,12 +.macro aes_encrypt_block block:req + aes_encrypt_round \block,Key0 + aes_encrypt_round \block,Key1 + aes_encrypt_round \block,Key2 + aes_encrypt_round \block,Key3 + aes_encrypt_round \block,Key4 + aes_encrypt_round \block,Key5 + aes_encrypt_round \block,Key6 + aes_encrypt_round \block,Key7 + aes_encrypt_round \block,Key8 + aes_encrypt_round \block,Key9 + aes_encrypt_round \block,Key10 + aes_encrypt_round \block,Key11 + aes_encrypt_round \block,Key12 + aese v\block\().16b,vKey13.16b + eor v\block\().16b,v\block\().16b,vKey14.16b +.endm + +/* + Load Aes Keys to [vKey0..vKey8,vKeyLast0,vKeyLast1] + */ +.macro load_aes_keys key_addr:req + ld1 { vKey0.4s- vKey3.4s},[\key_addr],64 + ld1 { vKey4.4s- vKey7.4s},[\key_addr],64 + ld1 { vKey8.4s- vKey11.4s},[\key_addr],64 + ld1 {vKey12.4s- vKey14.4s},[\key_addr],48 +.endm + + + +/* + [low,middle,tmp0,high] +=aadhash * [hashkey0,hashkey0_ext] + dat=*dat_adr + enc_dat=aes_encrypt(ctr)^dat + aadhash=rbit(enc_dat) + [hashkey0,hashkey0_ext] = *hashkey_adr + dat_adr+=16 + hashkey_adr+=32 +*/ +.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \ + hashkey0:req,hashkey0_ext:req,high:req,low:req, \ + ctr:req,enc_ctr:req,one:req,out_adr:req, \ + tmp0:req,tmp1:req,left_count:req + + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d + .if \left_count > 1 + ldr q\hashkey0,[\hashkey_adr],16 + .endif + + add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr + + rev32 v\enc_ctr\().16b,v\ctr\().16b + aes_encrypt_round \enc_ctr,Key0 + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + .if 
\left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr],16 + .endif + eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b + aes_encrypt_round \enc_ctr,Key1 + aes_encrypt_round \enc_ctr,Key2 + eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key3 + ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 + ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 + aes_encrypt_round \enc_ctr,Key4 + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key5 + ldr q\aadhash,[\dat_adr],16 + aes_encrypt_round \enc_ctr,Key6 + aes_encrypt_round \enc_ctr,Key7 + aes_encrypt_round \enc_ctr,Key8 + aes_encrypt_round \enc_ctr,Key9 + aes_encrypt_round \enc_ctr,Key10 + aes_encrypt_round \enc_ctr,Key11 + aes_encrypt_round \enc_ctr,Key12 + aese v\enc_ctr\().16b,vKey13.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b + .ifc \is_enc , encrypt + rbit v\aadhash\().16b,v\enc_ctr\().16b + .endif + .ifc \is_enc , decrypt + rbit v\aadhash\().16b,v\aadhash\().16b + .endif + str q\enc_ctr,[\out_adr],16 +.endm + +.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \ + hashkey0:req,hashkey0_ext:req, high:req,low:req, \ + ctr:req,enc_ctr:req,one:req,out_adr:req, \ + tmp0:req,tmp1:req,left_count:req + ldr q\hashkey0,[\hashkey_adr],16 + add v\ctr\().4s,v\ctr\().4s,v\one\().4s /*increase ctr */ + rev32 v\enc_ctr\().16b,v\ctr\().16b + aes_encrypt_round \enc_ctr,Key0 + ldr q\hashkey0_ext,[\hashkey_adr],16 + aes_encrypt_round \enc_ctr,Key1 + pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d + pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d + + .if \left_count > 1 + ldr q\hashkey0,[\hashkey_adr],16 + .endif + aes_encrypt_round \enc_ctr,Key2 + pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d + pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d + eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b + + .if \left_count > 1 + ldr q\hashkey0_ext,[\hashkey_adr],16 + .endif + aes_encrypt_round \enc_ctr,Key3 + eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b + + aes_encrypt_round \enc_ctr,Key4 + ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 /*low */ + ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 /* high */ + aes_encrypt_round \enc_ctr,Key5 + eor v\low\().16b,v\low\().16b,v\tmp1\().16b + eor v\high\().16b,v\high\().16b,v\tmp0\().16b + aes_encrypt_round \enc_ctr,Key6 + ldr q\aadhash,[\dat_adr],16 + aes_encrypt_round \enc_ctr,Key7 + aes_encrypt_round \enc_ctr,Key8 + aes_encrypt_round \enc_ctr,Key9 + aes_encrypt_round \enc_ctr,Key10 + aes_encrypt_round \enc_ctr,Key11 + aes_encrypt_round \enc_ctr,Key12 + aese v\enc_ctr\().16b,vKey13.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b + eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b + .ifc \is_enc , encrypt + rbit v\aadhash\().16b,v\enc_ctr\().16b + .endif + .ifc \is_enc , decrypt + rbit v\aadhash\().16b,v\aadhash\().16b + .endif + str q\enc_ctr,[\out_adr],16 +.endm + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S new file mode 100644 index 000000000..927179cfc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S @@ -0,0 +1,588 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +/* +void gist_aes_gcm_dec_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *out, \ + uint8_t const *in, \ + uint64_t len, \ + uint8_t *iv, \ + \ + uint8_t const *aad, \ + uint64_t aad_len, \ + uint8_t *auth_tag, \ + uint64_t auth_tag_len \ + \ + ) + */ + + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg out ,2 + declare_var_generic_reg in ,3 + declare_var_generic_reg len ,4 + declare_var_generic_reg iv ,5 + declare_var_generic_reg aad ,6 + declare_var_generic_reg aad_len ,7 + + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,5 + declare_var_generic_reg left_len ,12 + declare_var_generic_reg aad_left ,13 + declare_var_generic_reg temp0 ,14 + declare_var_generic_reg temp1 ,15 + + declare_var_generic_reg auth_tag ,0 /* input param */ + declare_var_generic_reg auth_tag_len,1 /* input param */ + + + declare_var_vector_reg Ctr,0 + declare_var_vector_reg AadHash,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Low,5 + declare_var_vector_reg EncCtr,6 + declare_var_vector_reg Dat0,6 + declare_var_vector_reg Middle0,7 + + declare_var_vector_reg Tmp0,8 + declare_var_vector_reg Tmp1,9 + declare_var_vector_reg Zero,10 + declare_var_vector_reg Poly,11 + declare_var_vector_reg LeftDat ,12 + declare_var_vector_reg Len ,13 + declare_var_vector_reg Tmp2,14 + declare_var_vector_reg Tmp3,15 + + declare_var_vector_reg One,31 + .set stack_size,64 + .macro push_stack + stp d8, d9,[sp,-stack_size]! 
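+	/* AAPCS64 keeps the low 64 bits of v8-v15 callee-saved; the GHASH/AES
+	   temporaries declared above (Tmp0..Tmp3, Zero, Poly, LeftDat, Len) sit
+	   in v8-v15, so their d halves are spilled here and restored in pop_stack. */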
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + .endm + + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], stack_size + .endm + +START_FUNC(enc,KEY_LEN,_) +START_FUNC(enc,KEY_LEN,_nt_) + push_stack + /*save in_length and aad_length*/ + stp aad_len,len,[context,AAD_LEN_OFF] + load_aes_keys key_data + /* Init Consts and IV */ + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + ld1 {vCtr.d}[0],[iv],8 + eor vZero.16b,vZero.16b,vZero.16b + ld1 {vCtr.s}[2],[iv] + mov temp0,0x87 + rev32 vCtr.16b,vCtr.16b /* to cpu order */ + ins vOne.s[3],wtemp1 + mov vAadHash.16b,vZero.16b + dup vPoly.2d,temp0 + ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */ + + + and left_len,aad_len,0xf + cbz aad_len,24f + lsr aad_len,aad_len,4 + /* Read small data */ + cbz left_len,2f /* aad_len >= 16,skip */ + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0 + cbnz left_len,1f /* aad_len & 0xf != 0 */ +2: + cbz aad_len,1f /* aad_len <16 skip*/ + /* left_len == 0 && aad_len !=0 */ + sub aad_len,aad_len,1 + /* leftDat = aad[-1] */ + ldr qLeftDat,[aad,aad_len,lsl 4] +1: + cbnz aad_len,1f /* aad_len >16,skip */ + rbit vAadHash.16b,vLeftDat.16b + b 24f /* aad_len <=16, skip aadhash caculate */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 + +1: + /* loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbnz aad_len,1f /* left aad_len >32,skip */ + ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + b 24f /* left aad_len <=32,skip below check */ +1: + mov temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add hashkey_addr,hashkey_base,temp0,lsl 5 + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ + sub aad_len,aad_len,1 + +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + +24: + + /* Enc/Dec loop */ + and left_len,len,15 + cbz len,24f + lsr len,len,4 +1: + /* loop aes gcm enc/dec loop */ + cmp len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop + sub len,len,HASHKEY_TOTAL_NUM + aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz len,24f /* left len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM + sub temp0,temp0,len + add hashkey_addr,hashkey_base,temp0,lsl 5 + + sub len,len,1 + aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz len,2f + sub len,len,1 +1: + + cbz len,1f + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + sub len,len,1 + b 1b +1: + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + 
High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */ +2: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +24: + /* complete part */ + cmp left_len,0 + movi vHigh.16b,0 + mov temp0,HASHKEY_TOTAL_NUM-3 + movi vLow.16b,0 + cinc hashkey_addr,temp0,eq + movi vMiddle0.16b,0 + add hashkey_addr,hashkey_base,hashkey_addr,lsl 5 + ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32 + beq 2f + read_small_data_start LeftDat,in,left_len,temp0,Tmp0 + add vCtr.4s,vCtr.4s,vOne.4s + rev32 vEncCtr.16b,vCtr.16b + aes_encrypt_round EncCtr,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round EncCtr,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round EncCtr,Key2 + ldr qHashKey0,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key3 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round EncCtr,Key4 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round EncCtr,Key5 + ldr qHashKey0Ext,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key6 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round EncCtr,Key7 + aes_encrypt_round EncCtr,Key8 +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 + aes_encrypt_round EncCtr,Key10 + aes_encrypt_round EncCtr,Key11 + aes_encrypt_round EncCtr,Key12 + aese vEncCtr.16b,vKey13.16b + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b +#else + aese vEncCtr.16b,vKey9.16b + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b +#endif + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b + write_small_data_start EncCtr,out,left_len,temp0,Tmp0 + clear_small_data EncCtr,Zero,left_len,temp0,Tmp0 + rbit vAadHash.16b,vEncCtr.16b +2: + + ldr qLen,[context,AAD_LEN_OFF] /* Len */ + mov wtemp0,1 /* Ek */ + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */ + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */ + rev64 vLen.16b,vLen.16b /* Len */ + ins vCtr.4s[3],wtemp0 /* Ek */ + ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */ + rev32 vEncCtr.16b,vCtr.16b /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */ + rbit vAadHash.16b,vLen.16b /* Len */ + + aes_encrypt_round EncCtr,Key0 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key1 /* Ek */ + ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key2 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key3 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key4 /* Ek */ + + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key5 /* Ek */ + aes_encrypt_round EncCtr,Key6 /* Ek */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key7 /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key8 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */ +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 /* Ek */ + aes_encrypt_round EncCtr,Key10 /* Ek */ + aes_encrypt_round EncCtr,Key11 /* Ek */ + aes_encrypt_round EncCtr,Key12 /* Ek */ + aese 
vEncCtr.16b,vKey13.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */ +#else + aese vEncCtr.16b,vKey9.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */ +#endif + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */ + rbit vAadHash.16b,vEncCtr.16b /* Aad */ + + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */ + rbit vAadHash.16b,vAadHash.16b /* Aad */ + + + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret +END_FUNC(enc,KEY_LEN,_) +END_FUNC(enc,KEY_LEN,_nt_) + + +START_FUNC(dec,KEY_LEN,_) +START_FUNC(dec,KEY_LEN,_nt_) + push_stack + /* save in_length and aad_length */ + stp aad_len,len,[context,AAD_LEN_OFF] + load_aes_keys key_data + /* Init Consts and IV */ + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + ld1 {vCtr.d}[0],[iv],8 + eor vZero.16b,vZero.16b,vZero.16b + ld1 {vCtr.s}[2],[iv] + mov temp0,0x87 + rev32 vCtr.16b,vCtr.16b /* to cpu order */ + mov vAadHash.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */ + + ldp qHashKey0,qHashKey0Ext,[hashkey_base] + and left_len,aad_len,0xf + cbz aad_len,24f + lsr aad_len,aad_len,4 + /* Read small data */ + cbz left_len,2f /* aad_len >= 16,skip */ + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0 + cbnz left_len,1f /* aad_len & 0xf != 0 */ +2: + cbz aad_len,1f /* aad_len <16 skip */ + /* left_len == 0 && aad_len !=0 */ + sub aad_len,aad_len,1 + /* leftDat = aad[-1] */ + ldr qLeftDat,[aad,aad_len,lsl 4] +1: + cbnz aad_len,1f /* aad_len >16,skip */ + rbit vAadHash.16b,vLeftDat.16b + b 24f /* aad_len <=16, skip aadhash caculate */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 + +1: + /** loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f /* break loop */ + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbnz aad_len,1f /* left aad_len >32,skip */ + ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + b 24f /* left aad_len <=32,skip below check */ +1: + mov temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add hashkey_addr,hashkey_base,temp0,lsl 5 + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ + sub aad_len,aad_len,1 + +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + +24: + + + /* Enc/Dec loop */ + and left_len,len,15 + cbz len,24f + lsr len,len,4 +1: + /* loop aes gcm enc/dec loop */ + cmp len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop 
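+	/* bulk path: each pass below consumes HASHKEY_TOTAL_NUM 16-byte blocks,
+	   walking the precomputed hash-key power table at hashkey_base. */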
+ sub len,len,HASHKEY_TOTAL_NUM + aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz len,24f /* left len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM + sub temp0,temp0,len + add hashkey_addr,hashkey_base,temp0,lsl 5 + + sub len,len,1 + aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz len,2f + sub len,len,1 +1: + + cbz len,1f + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + sub len,len,1 + b 1b +1: + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */ +2: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +24: + /* complete part */ + cmp left_len,0 + movi vHigh.16b,0 + mov temp0,21 + movi vLow.16b,0 + cinc hashkey_addr,temp0,eq + movi vMiddle0.16b,0 + add hashkey_addr,hashkey_base,hashkey_addr,lsl 5 + ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32 + beq 2f + read_small_data_start LeftDat,in,left_len,temp0,Tmp0 + add vCtr.4s,vCtr.4s,vOne.4s + rev32 vEncCtr.16b,vCtr.16b + aes_encrypt_round EncCtr,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round EncCtr,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round EncCtr,Key2 + ldr qHashKey0,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key3 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round EncCtr,Key4 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round EncCtr,Key5 + ldr qHashKey0Ext,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key6 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round EncCtr,Key7 + aes_encrypt_round EncCtr,Key8 +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 + aes_encrypt_round EncCtr,Key10 + aes_encrypt_round EncCtr,Key11 + aes_encrypt_round EncCtr,Key12 + aese vEncCtr.16b,vKey13.16b + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b +#endif +#if KEY_LEN==128 + aese vEncCtr.16b,vKey9.16b + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b +#endif + write_small_data_start EncCtr,out,left_len,temp0,Tmp0 + rbit vAadHash.16b,vLeftDat.16b + +2: + + ldr qLen,[context,AAD_LEN_OFF] /* Len */ + mov wtemp0,1 /* Ek */ + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */ + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */ + rev64 vLen.16b,vLen.16b /* Len */ + ins vCtr.4s[3],wtemp0 /* Ek */ + ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */ + rev32 vEncCtr.16b,vCtr.16b /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */ + rbit vAadHash.16b,vLen.16b /* Len */ + + aes_encrypt_round EncCtr,Key0 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key1 /* Ek */ + ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key2 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key3 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key4 
/* Ek */ + + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key5 /* Ek */ + aes_encrypt_round EncCtr,Key6 /* Ek */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key7 /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key8 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */ +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 /* Ek */ + aes_encrypt_round EncCtr,Key10 /* Ek */ + aes_encrypt_round EncCtr,Key11 /* Ek */ + aes_encrypt_round EncCtr,Key12 /* Ek */ + aese vEncCtr.16b,vKey13.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */ +#else + aese vEncCtr.16b,vKey9.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */ +#endif + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */ + rbit vAadHash.16b,vEncCtr.16b /* Aad */ + + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */ + rbit vAadHash.16b,vAadHash.16b /* Aad */ + + + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret +END_FUNC(dec,KEY_LEN,_) +END_FUNC(dec,KEY_LEN,_nt_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S new file mode 100644 index 000000000..b5433a1df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S @@ -0,0 +1,58 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface aes_gcm_enc_128 +mbin_interface aes_gcm_dec_128 +mbin_interface aes_gcm_precomp_128 +mbin_interface aes_gcm_enc_256 +mbin_interface aes_gcm_dec_256 +mbin_interface aes_gcm_precomp_256 + + +mbin_interface aes_gcm_enc_128_update +mbin_interface aes_gcm_enc_128_finalize +mbin_interface aes_gcm_dec_128_update +mbin_interface aes_gcm_dec_128_finalize +mbin_interface aes_gcm_enc_256_update +mbin_interface aes_gcm_enc_256_finalize +mbin_interface aes_gcm_dec_256_update +mbin_interface aes_gcm_dec_256_finalize + +mbin_interface aes_gcm_init_256 +mbin_interface aes_gcm_init_128 +mbin_interface aes_gcm_enc_128_nt +mbin_interface aes_gcm_enc_128_update_nt +mbin_interface aes_gcm_dec_128_nt +mbin_interface aes_gcm_dec_128_update_nt +mbin_interface aes_gcm_enc_256_nt +mbin_interface aes_gcm_enc_256_update_nt +mbin_interface aes_gcm_dec_256_nt +mbin_interface aes_gcm_dec_256_update_nt diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S new file mode 100644 index 000000000..e555c9798 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S @@ -0,0 +1,83 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +/* +void aes_gcm_precomp(struct gcm_key_data *key_data); +*/ + declare_var_generic_reg key_data ,0 + declare_var_generic_reg temp0 ,1 + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,1 + + declare_var_vector_reg Low ,0 + declare_var_vector_reg Middle0 ,1 + declare_var_vector_reg Middle1 ,2 + declare_var_vector_reg High ,3 + declare_var_vector_reg HashKeyIter ,4 + declare_var_vector_reg HashKey ,5 + declare_var_vector_reg HashKeyExt ,6 + declare_var_vector_reg Poly ,7 + declare_var_vector_reg Zero ,31 + +START_FUNC(precomp,KEY_LEN,_) + load_aes_keys key_data + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + eor vHashKey.16b,vHashKey.16b,vHashKey.16b + dup vPoly.2d,temp0 + aes_encrypt_block HashKey + add hashkey_addr,hashkey_base,(HASHKEY_TOTAL_NUM-1)*32 + rbit vHashKey.16b,vHashKey.16b + ext vHashKeyExt.16b,vHashKey.16b,vHashKey.16b,8 + mov vHashKeyIter.16b,vHashKey.16b + stp qHashKey,qHashKeyExt,[hashkey_addr],-32 + +1: + pmull vMiddle0.1q,vHashKeyIter.1d,vHashKeyExt.1d + pmull2 vMiddle1.1q,vHashKeyIter.2d,vHashKeyExt.2d + pmull vLow.1q ,vHashKeyIter.1d,vHashKey.1d + eor vMiddle0.16b,vMiddle0.16b,vMiddle1.16b + pmull2 vHigh.1q ,vHashKeyIter.2d,vHashKey.2d + ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high + ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low + eor vHigh.16b ,vHigh.16b,vMiddle1.16b + eor vLow.16b ,vLow.16b ,vMiddle0.16b + pmull2 vMiddle0.1q ,vHigh.2d ,vPoly.2d + ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high + ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low + eor vHigh.16b ,vHigh.16b,vMiddle1.16b + eor vLow.16b ,vLow.16b ,vMiddle0.16b + pmull vMiddle0.1q ,vHigh.1d ,vPoly.1d + eor vHashKeyIter.16b,vLow.16b,vMiddle0.16b + ext vLow.16b,vHashKeyIter.16b,vHashKeyIter.16b,8 + stp qHashKeyIter,qLow,[hashkey_addr],-32 + cmp hashkey_addr,hashkey_base + bcs 1b + + ret +END_FUNC(precomp,KEY_LEN,_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S new file mode 100644 index 000000000..d47c52212 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S @@ -0,0 +1,277 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +/* +void gist_aes_gcm_dec_update_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *out, \ + const uint8_t *in, \ + uint64_t len \ + ) + */ + + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg out ,2 + declare_var_generic_reg in ,3 + declare_var_generic_reg len ,4 + declare_var_generic_reg partial_block_length,5 + declare_var_generic_reg blocks ,5 + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,6 + declare_var_generic_reg temp0 ,14 + declare_var_generic_reg temp1 ,15 + declare_var_generic_reg temp2 ,13 + + + + declare_var_vector_reg Ctr,0 + declare_var_vector_reg AadHash,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Low,5 + declare_var_vector_reg EncCtr,6 + declare_var_vector_reg Middle,7 + + declare_var_vector_reg Tmp0,8 + declare_var_vector_reg Tmp1,9 + declare_var_vector_reg Zero,10 + declare_var_vector_reg Poly,11 + declare_var_vector_reg PartialBlock ,12 + declare_var_vector_reg One,31 + .set stack_size,48 + .macro push_stack + stp d8, d9, [sp,-stack_size]! 
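+	/* only three pairs (d8-d13) are spilled here (stack_size is 48): the
+	   update path uses v8-v12 of the callee-saved range, one pair fewer
+	   than the one-shot enc/dec routines, which spill d8-d15. */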
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + + .endm + + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d8, d9, [sp], stack_size + .endm +/* + 20:exit_without_popstack + 21:start_of_mainloop + 22:exit_with_popstack + 23:partial_block_start + */ +START_FUNC(enc,KEY_LEN,_update_) +START_FUNC(enc,KEY_LEN,_update_nt_) + ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */ + ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF] + ldr qAadHash,[context] + cbz len,20f /** if(len==0)return; exit_without_popstack*/ + push_stack + add temp0,temp0,len /* temp0=temp0+len */ + load_aes_keys key_data + str temp0,[context,IN_LENGTH_OFF] /* save in_length */ + /* Init Consts and IV */ + ldr qCtr,[context,CTR_OFF] + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/ +21: /* start_of_mainloop */ + cbz len,24f + lsr blocks,len,4 + cmp blocks,HASHKEY_TOTAL_NUM - 1 + and len,len,0xf + /* loop aes gcm enc/dec loop */ + bls 2f /* skip loop */ +1: + sub blocks,blocks,HASHKEY_TOTAL_NUM + cmp blocks,HASHKEY_TOTAL_NUM - 1 + aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + bhi 1b /* back to loop start */ +2: + cbz blocks,4f // left blocks == 0 + /* -(blocks - HASHKEY_TOTAL_NUM) */ + sub temp0,blocks,HASHKEY_TOTAL_NUM + neg temp0,temp0 + sub blocks,blocks,1 + add hashkey_addr,hashkey_base,temp0,lsl 5 + + aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz blocks,3f /* origin_blocks == 1 */ + sub blocks,blocks,1 + + cbz blocks,2f /* origin_blocks == 2 */ +1: + sub blocks,blocks,1 + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbnz blocks,1b +2: + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */ +3: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +4: + str qAadHash,[context] + str qCtr,[context,CTR_OFF] + cbnz len,24f +22: /* exit_with_popstack */ + pop_stack +20: /* exit_without_popstack */ + ret +23: /* partial_block_start */ + + generic_partial_block_start encrypt,len,in,out,context, \ + temp2,partial_block_length,temp0,temp1,hashkey_addr + cbnz partial_block_length,22b + ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16] + ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \ + High,Low,Middle,Zero,Poly,Tmp0 + str qAadHash,[context] + cbz len,4b + cmp len,15 + bhi 21b +24: /*partial_block_end */ + add vCtr.4s,vCtr.4s,vOne.4s + read_small_data_start PartialBlock,in,len,temp0,Tmp0 + rev32 vEncCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + aes_encrypt_block EncCtr + eor vPartialBlock.16b,vPartialBlock.16b,vEncCtr.16b + str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + write_small_data_start PartialBlock,out,len,temp0,Tmp0 + str len,[context,PARTIAL_BLOCK_LENGTH_OFF] + pop_stack + ret + +END_FUNC(enc,KEY_LEN,_update_) +END_FUNC(enc,KEY_LEN,_update_nt_) + + +START_FUNC(dec,KEY_LEN,_update_) +START_FUNC(dec,KEY_LEN,_update_nt_) + ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */ + ldr 
partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF] + ldr qAadHash,[context] + cbz len,20f /** if(len==0)return; exit_without_popstack*/ + push_stack + add temp0,temp0,len /* temp0=temp0+len */ + load_aes_keys key_data + str temp0,[context,IN_LENGTH_OFF] /* save in_length */ + /* Init Consts and IV */ + ldr qCtr,[context,CTR_OFF] + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/ +21: /* start_of_mainloop */ + cbz len,24f + lsr blocks,len,4 + cmp blocks,HASHKEY_TOTAL_NUM - 1 + and len,len,0xf + /** loop aes gcm enc/dec loop */ + bls 2f /* skip loop */ +1: + sub blocks,blocks,HASHKEY_TOTAL_NUM + cmp blocks,HASHKEY_TOTAL_NUM - 1 + aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + bhi 1b /* back to loop start */ +2: + cbz blocks,4f /* left blocks == 0 */ + /* -(blocks - HASHKEY_TOTAL_NUM) */ + sub temp0,blocks,HASHKEY_TOTAL_NUM + neg temp0,temp0 + sub blocks,blocks,1 + add hashkey_addr,hashkey_base,temp0,lsl 5 + + aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 // load next hash + cbz blocks,3f /* origin_blocks == 1 */ + sub blocks,blocks,1 + + cbz blocks,2f /* origin_blocks == 2 */ +1: + sub blocks,blocks,1 + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbnz blocks,1b +2: + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */ +3: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +4: + str qAadHash,[context] + str qCtr,[context,CTR_OFF] + cbnz len,24f +22: /* exit_with_popstack */ + pop_stack +20: /* exit_without_popstack */ + ret +23: /* partial_block_start */ + + generic_partial_block_start decrypt,len,in,out,context, \ + temp2,partial_block_length,temp0,temp1,hashkey_addr + cbnz partial_block_length,22b + ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16] + ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \ + High,Low,Middle,Zero,Poly,Tmp0 + str qAadHash,[context] + cbz len,4b + cmp len,15 + bhi 21b +24: /* partial_block_end */ + add vCtr.4s,vCtr.4s,vOne.4s + read_small_data_start PartialBlock,in,len,temp0,Tmp0 + rev32 vEncCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + aes_encrypt_block EncCtr + eor vEncCtr.16b,vPartialBlock.16b,vEncCtr.16b + tbx_small_data_start EncCtr,PartialBlock,len,temp0,Tmp0 + write_small_data_start EncCtr,out,len,temp0,Tmp0 + str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + str len,[context,PARTIAL_BLOCK_LENGTH_OFF] + pop_stack + ret +END_FUNC(dec,KEY_LEN,_update_) +END_FUNC(dec,KEY_LEN,_update_nt_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S new file mode 100644 index 000000000..4a3e990c3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S @@ -0,0 +1,134 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.arch armv8-a+crypto + + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_128 off:req,rcon:req + .if \off == 0 + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp0,w_tmp1,[key] + movi vzero.4s,0 + dup vsrc.4s,w_tmp3 + stp w_tmp2,w_tmp3,[exp_key_enc,8] + stp w_tmp0,w_tmp1,[exp_key_enc] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp4,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp4,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*\off+KEY_LEN] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*\off+8+KEY_LEN] + .if \off != 10 + dup vsrc.4s,w_tmp3 + .endif +.endm +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + ldr q0,[\enc_key,(1+8)*16] + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + aesimc v0.16b,v0.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + str q0,[\dec_key,(\rounds-1-9)*16] +.endm +/** + void aes_keyexp_128_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (128/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp4 .req w7 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_128_aes + .type aes_keyexp_128_aes, %function + +aes_keyexp_128_aes: + .set 
rcon,1 + .set off,0 + .rept 10 + round_128 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + + export_dec_key NUM_ROUNDS(128),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_128_aes, .-aes_keyexp_128_aes + .global aes_keyexp_128_enc_aes + .type aes_keyexp_128_enc_aes, %function +aes_keyexp_128_enc_aes: + .set rcon,1 + .set off,0 + .rept 10 + round_128 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + ret + .size aes_keyexp_128_enc_aes, .-aes_keyexp_128_enc_aes \ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S new file mode 100644 index 000000000..2ba46060c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S @@ -0,0 +1,136 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_192 off:req,rcon:req + .if \off == 0 + ldp w_tmp0,w_tmp1,[key] + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp4,w_tmp5,[key,16] + movi vzero.4s,0 + dup vsrc.4s,w_tmp5 + stp w_tmp0,w_tmp1,[exp_key_enc] + stp w_tmp4,w_tmp5,[exp_key_enc,16] + stp w_tmp2,w_tmp3,[exp_key_enc,8] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + .if \off < 7 + eor w_tmp4,w_tmp4,w_tmp3 + eor w_tmp5,w_tmp5,w_tmp4 + dup vsrc.4s,w_tmp5 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16] + .else + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + .endif +.endm + +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + ldp q0,q1,[\enc_key,(1+8)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + ldr q2,[\enc_key,(1+10)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + aesimc v2.16b,v2.16b + stp q1,q0,[\dec_key,(\rounds-1-10)*16] + str q2,[\dec_key,(\rounds-1-11)*16] +.endm +/** + void aes_keyexp_192_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (192/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp .req w7 + w_tmp4 .req w9 + w_tmp5 .req w10 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_192_aes + .type aes_keyexp_192_aes, %function + +aes_keyexp_192_aes: + .set rcon,1 + .set off,0 + .rept 8 + round_192 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + export_dec_key NUM_ROUNDS(192),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_192_aes, .-aes_keyexp_192_aes + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S new file mode 100644 index 000000000..5433b2ff6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S @@ -0,0 +1,153 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_256 off:req,rcon:req,export_dec_key + .if \off == 0 + ldp w_tmp6,w_tmp7,[key,24] + ldp w_tmp0,w_tmp1,[key] + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp4,w_tmp5,[key,16] + movi vzero.4s,0 + dup vsrc.4s,w_tmp7 + stp w_tmp6,w_tmp7,[exp_key_enc,24] + stp w_tmp0,w_tmp1,[exp_key_enc] + stp w_tmp4,w_tmp5,[exp_key_enc,16] + stp w_tmp2,w_tmp3,[exp_key_enc,8] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + .if \off < 6 + dup vsrc.4s,w_tmp3 + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp4,w_tmp4,w_tmp + eor w_tmp5,w_tmp5,w_tmp4 + eor w_tmp6,w_tmp6,w_tmp5 + eor w_tmp7,w_tmp7,w_tmp6 + dup vsrc.4s,w_tmp7 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16] + stp w_tmp6,w_tmp7,[exp_key_enc,KEY_LEN*(\off+1)+24] + .else + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + .endif +.endm + +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + ldp q0,q1,[\enc_key,(1+8)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + ldp q2,q3,[\enc_key,(1+10)*16] + + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + ldr q4,[\enc_key,(1+12)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-10)*16] + aesimc v4.16b,v4.16b + stp 
q3,q2,[\dec_key,(\rounds-1-12)*16] + str q4,[\dec_key,(\rounds-1-13)*16] +.endm +/** + void aes_keyexp_256_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (256/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp .req w7 + w_tmp4 .req w9 + w_tmp5 .req w10 + w_tmp6 .req w11 + w_tmp7 .req w12 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_256_aes + .type aes_keyexp_256_aes, %function + +aes_keyexp_256_aes: + .set rcon,1 + .set off,0 + .rept 7 + round_256 off,rcon,1 + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + export_dec_key NUM_ROUNDS(256),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_256_aes, .-aes_keyexp_256_aes + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c new file mode 100644 index 000000000..14c9889ac --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c @@ -0,0 +1,72 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_128_aes); + + return PROVIDER_BASIC(aes_keyexp_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128_enc) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_128_enc_aes); + + return PROVIDER_BASIC(aes_keyexp_128_enc); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_192) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_192_aes); + + return PROVIDER_BASIC(aes_keyexp_192); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_256) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_256_aes); + + return PROVIDER_BASIC(aes_keyexp_256); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S new file mode 100644 index 000000000..aa7c32576 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface aes_keyexp_128 +mbin_interface aes_keyexp_128_enc +mbin_interface aes_keyexp_192 +mbin_interface aes_keyexp_256 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c new file mode 100644 index 000000000..6c918858e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c @@ -0,0 +1,102 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_enc_ce); + } + return PROVIDER_BASIC(XTS_AES_128_enc); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_dec_ce); + } + return PROVIDER_BASIC(XTS_AES_128_dec); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_enc_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_128_enc_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_dec_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_128_dec_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_enc_ce); + } + return PROVIDER_BASIC(XTS_AES_256_enc); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_dec_ce); + } + return PROVIDER_BASIC(XTS_AES_256_dec); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_enc_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_256_enc_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_dec_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_256_dec_expanded_key); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S new file mode 100644 index 000000000..318c1e8a4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S @@ -0,0 +1,214 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +.altmacro +.macro aes_key_expand_next next:req,prev:req,ctx:req + .if \next == 9 + mov \ctx, 0x1b + .endif + dup vdest.4s,vKey\prev\().s[3] + ext vtmp.16b,vzero.16b,vKey\prev\().16b,#12 + aese vdest.16b,vzero.16b + eor vKey\next\().16b,vKey\prev\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + eor vKey\next\().16b,vKey\next\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + mov tmpw,vdest.s[0] + eor tmpw,\ctx,tmpw,ror 8 + dup vdest.4s,tmpw + eor vKey\next\().16b,vKey\next\().16b,vtmp.16b + mov \ctx,ctx,lsl 1 + eor vKey\next\().16b,vKey\next\().16b,vdest.16b +.endm + +/* when loadin key = 0 + * arg1 = input key + * arg2 = rcon ctx register (optional) + * when loading key > 0 + * arg1 = rcon ctx register (optional) + */ +.macro aes_key_expand key:req,arg1,arg2 + .if \key == 0 + ld1 {vKey\key\().4s},[\arg1] + movi vzero.4s, 0 + .ifb \arg2 + mov rcon,#0x01 + .endif + + .ifnb \arg2 + mov \arg2,#0x01 + .endif + .endif + + .if \key > 0 + prev=\key-1 + .ifb \arg1 + aes_key_expand_next \key,%prev,rcon + .endif + + .ifnb \arg1 + aes_key_expand_next \key,%prev,\arg1 + .endif + .endif +.endm + +.macro aes_round block:req,key:req,mode:req + .if \key < 9 + .if mode == 0 + aese \block\().16b,vKey\key\().16b + aesmc \block\().16b,\block\().16b + .else + aesd \block\().16b,vKey\key\().16b + aesimc \block\().16b,\block\().16b + .endif + .endif + .if \key == 9 + .if mode == 0 + aese \block\().16b,vKey\key\().16b + .else + aesd \block\().16b,vKey\key\().16b + .endif + .endif + .if \key == 10 + eor \block\().16b,\block\().16b,vKey\key\().16b + .endif +.endm + +.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key + .if \key < 9 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + aesmc \b0\().16b,\b0\().16b + aese \b1\().16b,vKey\key\().16b + aesmc \b1\().16b,\b1\().16b + aese \b2\().16b,vKey\key\().16b + aesmc \b2\().16b,\b2\().16b + aese \b3\().16b,vKey\key\().16b + aesmc \b3\().16b,\b3\().16b + .else + aesd \b0\().16b,vKey\key\().16b + aesimc \b0\().16b,\b0\().16b + aesd \b1\().16b,vKey\key\().16b + aesimc \b1\().16b,\b1\().16b + aesd \b2\().16b,vKey\key\().16b + aesimc \b2\().16b,\b2\().16b + aesd \b3\().16b,vKey\key\().16b + aesimc \b3\().16b,\b3\().16b + .endif + .endif + + .if \key == 9 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aese \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aese \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aese \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .else + aesd \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aesd \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aesd \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aesd \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .endif 
+ .endif +.endm + +.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode + aes_round_interleave \b0,\b1,\b2,\b3,0,\mode + aes_round_interleave \b0,\b1,\b2,\b3,1,\mode + aes_round_interleave \b0,\b1,\b2,\b3,2,\mode + aes_round_interleave \b0,\b1,\b2,\b3,3,\mode + aes_round_interleave \b0,\b1,\b2,\b3,4,\mode + aes_round_interleave \b0,\b1,\b2,\b3,5,\mode + aes_round_interleave \b0,\b1,\b2,\b3,6,\mode + aes_round_interleave \b0,\b1,\b2,\b3,7,\mode + aes_round_interleave \b0,\b1,\b2,\b3,8,\mode + aes_round_interleave \b0,\b1,\b2,\b3,9,\mode,10 +.endm + +.macro aes_rounds blk:req,mode:req + aes_round \blk,0,\mode + aes_round \blk,1,\mode + aes_round \blk,2,\mode + aes_round \blk,3,\mode + aes_round \blk,4,\mode + aes_round \blk,5,\mode + aes_round \blk,6,\mode + aes_round \blk,7,\mode + aes_round \blk,8,\mode + aes_round \blk,9,\mode + aes_round \blk,10,\mode +.endm + +/* load k1/k2 from memory and encrypt the tweak by k2 + * boths keys will share the same set of registers + * but will never overlap (k2 is used only once and discarded) + */ +.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req + ldp qKey0,qKey1,[\k2],#32 + aes_enc_round \iv,0 + ldp qKey2,qKey3,[\k2],#32 + aes_enc_round \iv,1 + ldp qKey0,qKey1,[\k1],#32 + aes_enc_round \iv,2 + ldp qKey4,qKey5,[\k2],#32 + aes_enc_round \iv,3 + ldp qKey2,qKey3,[\k1],#32 + aes_enc_round \iv,4 + ldp qKey6,qKey7,[\k2],#32 + aes_enc_round \iv,5 + ldp qKey4,qKey5,[\k1],#32 + aes_enc_round \iv,6 + ldp qKey8,qKey9,[k2],#32 + aes_enc_round \iv,7 + ldp qKey6,qKey7,[\k1],#32 + aes_enc_round \iv,8 + ld1 {vKey10.16b},[\k2],#16 + aes_enc_round \iv,9 + ldp qKey8,qKey9,[\k1],#32 + aes_enc_round \iv,10 + ld1 {vKey10.16b},[\k1],#16 +.endm + +.macro save_stack + stp d8,d9,[sp, -32]! + add tmpbuf,sp,16 +.endm + +.macro restore_stack + ldp d8,d9,[sp],32 +.endm + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S new file mode 100644 index 000000000..ceae2d3c0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S @@ -0,0 +1,116 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
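The aes_key_expand/aes_key_expand_next macros of xts_aes_128_common.S above are the FIPS-197 AES-128 key schedule written with NEON (dup/ext/aese plus the running rcon register). For reference, one expansion step in plain C, assuming sbox[] is filled with the standard AES S-box (the table itself is omitted to keep the sketch short):

#include <stdint.h>

static const uint8_t sbox[256] = { 0 /* replace with the standard AES S-box */ };

/* Derive round key i (16 bytes) from round key i-1; rcon is Rcon[i]. */
static void aes128_key_expand_step(const uint8_t prev[16], uint8_t next[16],
				   uint8_t rcon)
{
	uint8_t t[4];

	/* RotWord + SubWord on the last word of the previous round key,
	 * with the round constant folded into the first byte. */
	t[0] = sbox[prev[13]] ^ rcon;
	t[1] = sbox[prev[14]];
	t[2] = sbox[prev[15]];
	t[3] = sbox[prev[12]];

	for (int i = 0; i < 4; i++) {
		next[i]      = prev[i]      ^ t[i];
		next[i + 4]  = prev[i + 4]  ^ next[i];
		next[i + 8]  = prev[i + 8]  ^ next[i + 4];
		next[i + 12] = prev[i + 12] ^ next[i + 8];
	}
}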
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +.macro vswap vec1:req,vec2:req + mov vtmp.16b,\vec1\().16b + mov \vec1\().16b,\vec2\().16b + mov \vec2\().16b,vtmp.16b +.endm + +/* encrypt the tweak by tweak key (k2), and at the same time + * to expand encryption key (k1) + * even though two sets of keys share the same set of registers + * they never overlap at any given time (k2 is used only once and discarded) + */ +.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req + aes_key_expand 0,\k2 + aes_enc_round \iv,0 + aes_key_expand 1 + aes_enc_round \iv,1 + aes_key_expand 0,\k1,rcon2 + aes_key_expand 2 + aes_enc_round \iv,2 + aes_key_expand 1,rcon2 + aes_key_expand 3 + aes_enc_round \iv,3 + aes_key_expand 2,rcon2 + aes_key_expand 4 + aes_enc_round \iv,4 + aes_key_expand 3,rcon2 + aes_key_expand 5 + aes_enc_round \iv,5 + aes_key_expand 4,rcon2 + aes_key_expand 6 + aes_enc_round \iv,6 + aes_key_expand 5,rcon2 + aes_key_expand 7 + aes_enc_round \iv,7 + aes_key_expand 6,rcon2 + aes_key_expand 8 + aes_enc_round \iv,8 + aes_key_expand 7,rcon2 + aes_key_expand 9 + aes_enc_round \iv,9 + aes_key_expand 8,rcon2 + aes_key_expand 10 + aes_enc_round \iv,10 + aes_key_expand 9,rcon2 + aes_key_expand 10,rcon2 + + // transform encryption key into decrption key + aesimc vKey1.16b,vKey1.16b + vswap vKey0,vKey10 + aesimc vKey9.16b,vKey9.16b + + aesimc vKey2.16b,vKey2.16b + aesimc vKey8.16b,vKey8.16b + vswap vKey1,vKey9 + + aesimc vKey3.16b,vKey3.16b + aesimc vKey7.16b,vKey7.16b + vswap vKey2,vKey8 + + aesimc vKey4.16b,vKey4.16b + aesimc vKey6.16b,vKey6.16b + vswap vKey3,vKey7 + + aesimc vKey5.16b,vKey5.16b + vswap vKey4,vKey6 +.endm + +/* + * void XTS_AES_128_dec_ce( + * uint8_t *k2, //!< key used for tweaking, 16 bytes + * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); +*/ + .global XTS_AES_128_dec_ce + .type XTS_AES_128_dec_ce, %function +XTS_AES_128_dec_ce: + xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1 + .size XTS_AES_128_dec_ce, .-XTS_AES_128_dec_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S new file mode 100644 index 000000000..23ed14a38 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S @@ -0,0 +1,91 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
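In keyexp_and_encrypt_tweak above, the freshly expanded encryption schedule is converted into a decryption schedule by reversing the round-key order (the vswap calls) and applying AESIMC, i.e. InvMixColumns, to the middle round keys; this is the usual "equivalent inverse cipher" layout. The same transform for AES-128 in plain C, as a sketch rather than the library's code:

#include <stdint.h>
#include <string.h>

/* Multiply two bytes in GF(2^8) with the AES polynomial 0x11b. */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b) {
		if (b & 1)
			p ^= a;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
		b >>= 1;
	}
	return p;
}

/* InvMixColumns over one 16-byte round key, column by column. */
static void inv_mix_columns(uint8_t rk[16])
{
	for (int c = 0; c < 4; c++) {
		uint8_t *col = rk + 4 * c;
		uint8_t a0 = col[0], a1 = col[1], a2 = col[2], a3 = col[3];

		col[0] = gf_mul(a0, 14) ^ gf_mul(a1, 11) ^ gf_mul(a2, 13) ^ gf_mul(a3, 9);
		col[1] = gf_mul(a0, 9)  ^ gf_mul(a1, 14) ^ gf_mul(a2, 11) ^ gf_mul(a3, 13);
		col[2] = gf_mul(a0, 13) ^ gf_mul(a1, 9)  ^ gf_mul(a2, 14) ^ gf_mul(a3, 11);
		col[3] = gf_mul(a0, 11) ^ gf_mul(a1, 13) ^ gf_mul(a2, 9)  ^ gf_mul(a3, 14);
	}
}

/* Reverse the 11 AES-128 round keys and InvMixColumns the middle nine. */
static void make_dec_round_keys(const uint8_t ek[11][16], uint8_t dk[11][16])
{
	for (int i = 0; i <= 10; i++)
		memcpy(dk[i], ek[10 - i], 16);
	for (int i = 1; i <= 9; i++)
		inv_mix_columns(dk[i]);
}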
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +/* encrypt the tweak by tweak key (k2), and at the same time + * to expand encryption key (k1) + * even though two sets of keys share the same set of registers + * they never overlap at any given time (k2 is used once and discarded) + */ +.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req + aes_key_expand 0,\k2 + aes_enc_round \iv,0 + aes_key_expand 1 + aes_enc_round \iv,1 + aes_key_expand 0,\k1,rcon2 + aes_key_expand 2 + aes_enc_round \iv,2 + aes_key_expand 1,rcon2 + aes_key_expand 3 + aes_enc_round \iv,3 + aes_key_expand 2,rcon2 + aes_key_expand 4 + aes_enc_round \iv,4 + aes_key_expand 3,rcon2 + aes_key_expand 5 + aes_enc_round \iv,5 + aes_key_expand 4,rcon2 + aes_key_expand 6 + aes_enc_round \iv,6 + aes_key_expand 5,rcon2 + aes_key_expand 7 + aes_enc_round \iv,7 + aes_key_expand 6,rcon2 + aes_key_expand 8 + aes_enc_round \iv,8 + aes_key_expand 7,rcon2 + aes_key_expand 9 + aes_enc_round \iv,9 + aes_key_expand 8,rcon2 + aes_key_expand 10 + aes_enc_round \iv,10 + aes_key_expand 9,rcon2 + aes_key_expand 10,rcon2 +.endm + + +/* + * void XTS_AES_128_enc_ce( + * uint8_t *k2, //!< key used for tweaking, 16 bytes + * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< cleartext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + */ + .global XTS_AES_128_enc_ce + .type XTS_AES_128_enc_ce, %function +XTS_AES_128_enc_ce: + xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1 + .size XTS_AES_128_enc_ce, .-XTS_AES_128_enc_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S new file mode 100644 index 000000000..e6535dba3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S @@ -0,0 +1,247 @@ 
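XTS_AES_128_enc_ce and XTS_AES_128_dec_ce above form a sector-level encrypt/decrypt pair. A minimal round-trip sketch using exactly the prototypes documented above (in a real build they come from the library's public XTS header, and a CPU with the ARMv8 Crypto Extensions is assumed since the _ce routines are called directly):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

void XTS_AES_128_enc_ce(uint8_t *k2, uint8_t *k1, uint8_t *tw_initial,
			uint64_t n, const uint8_t *pt, uint8_t *ct);
void XTS_AES_128_dec_ce(uint8_t *k2, uint8_t *k1, uint8_t *tw_initial,
			uint64_t n, const uint8_t *ct, uint8_t *pt);

int main(void)
{
	uint8_t k1[16] = { 0 }, k2[16] = { 0 }, tweak[16] = { 0 };
	uint8_t sector[512], enc[512], dec[512];

	memset(sector, 0xab, sizeof(sector));	/* sample sector payload  */
	tweak[0] = 1;				/* e.g. sector number, LE */

	XTS_AES_128_enc_ce(k2, k1, tweak, sizeof(sector), sector, enc);
	XTS_AES_128_dec_ce(k2, k1, tweak, sizeof(dec), enc, dec);

	printf("round trip %s\n",
	       memcmp(sector, dec, sizeof(sector)) ? "failed" : "ok");
	return 0;
}

Application code would normally call the dispatched XTS_AES_128_enc/XTS_AES_128_dec entry points rather than a specific _ce variant, letting the multibinary layer pick an implementation at run time.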
+/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +.altmacro +.macro aes_key_expand_next out0:req,out1:req,in0:req,in1:req,ctx:req + dup vdest.4s,vKey\in1\().s[3] + ext vtmp.16b,vzero.16b,vKey\in0\().16b,#12 + aese vdest.16b,vzero.16b + eor vKey\out0\().16b,vKey\in0\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + mov tmpw,vdest.s[0] + eor tmpw,\ctx,tmpw,ror 8 + dup vdest.4s,tmpw + eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b + mov \ctx,ctx,lsl 1 + eor vKey\out0\().16b,vKey\out0\().16b,vdest.16b + + .if \out1 < 14 + dup vdest.4s, vKey\out0\().s[3] + ext vtmp.16b, vzero.16b,vKey\in1\().16b,#12 + aese vdest.16b,vzero.16b + eor vKey\out1\().16b,vKey\in1\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b + eor vKey\out1\().16b,vKey\out1\().16b,vdest.16b + .endif +.endm + +/* when loadin key = 0 + * arg1 = input key + * arg2 = rcon ctx register (optional) + * when loading key > 0 + * arg1 = rcon ctx register (optional) + */ +.macro aes_key_expand key:req,arg1,arg2 + .if \key == 0 + ld1 {vKey0.4s,vKey1.4s},[\arg1] + movi vzero.4s, 0 + .ifb \arg2 + mov rcon,#0x01 + .endif + + .ifnb \arg2 + mov \arg2,#0x01 + .endif + .endif + + .if \key > 0 + in0=\key-2 + in1=\key-1 + out0=\key + out1=\key+1 + .ifb \arg1 + aes_key_expand_next %out0,%out1,%in0,%in1,rcon + .endif + + .ifnb \arg1 + aes_key_expand_next %out0,%out1,%in0,%in1,\arg1 + .endif + .endif +.endm + +.macro aes_round block:req,key:req,mode:req + .if \key < 13 + .if mode == 0 + aese \block\().16b,vKey\key\().16b + aesmc \block\().16b,\block\().16b + .else + aesd \block\().16b,vKey\key\().16b + aesimc \block\().16b,\block\().16b + .endif + .endif + .if \key == 13 + .if mode == 0 + aese \block\().16b,vKey\key\().16b + .else + 
aesd \block\().16b,vKey\key\().16b + .endif + .endif + .if \key == 14 + eor \block\().16b,\block\().16b,vKey\key\().16b + .endif +.endm + +.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key + .if \key < 13 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + aesmc \b0\().16b,\b0\().16b + aese \b1\().16b,vKey\key\().16b + aesmc \b1\().16b,\b1\().16b + aese \b2\().16b,vKey\key\().16b + aesmc \b2\().16b,\b2\().16b + aese \b3\().16b,vKey\key\().16b + aesmc \b3\().16b,\b3\().16b + .else + aesd \b0\().16b,vKey\key\().16b + aesimc \b0\().16b,\b0\().16b + aesd \b1\().16b,vKey\key\().16b + aesimc \b1\().16b,\b1\().16b + aesd \b2\().16b,vKey\key\().16b + aesimc \b2\().16b,\b2\().16b + aesd \b3\().16b,vKey\key\().16b + aesimc \b3\().16b,\b3\().16b + .endif + .endif + + .if \key == 13 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aese \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aese \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aese \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .else + aesd \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aesd \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aesd \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aesd \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .endif + .endif +.endm + + + +.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode + aes_round_interleave \b0,\b1,\b2,\b3,0,\mode + aes_round_interleave \b0,\b1,\b2,\b3,1,\mode + aes_round_interleave \b0,\b1,\b2,\b3,2,\mode + aes_round_interleave \b0,\b1,\b2,\b3,3,\mode + aes_round_interleave \b0,\b1,\b2,\b3,4,\mode + aes_round_interleave \b0,\b1,\b2,\b3,5,\mode + aes_round_interleave \b0,\b1,\b2,\b3,6,\mode + aes_round_interleave \b0,\b1,\b2,\b3,7,\mode + aes_round_interleave \b0,\b1,\b2,\b3,8,\mode + aes_round_interleave \b0,\b1,\b2,\b3,9,\mode + aes_round_interleave \b0,\b1,\b2,\b3,10,\mode + aes_round_interleave \b0,\b1,\b2,\b3,11,\mode + aes_round_interleave \b0,\b1,\b2,\b3,12,\mode + aes_round_interleave \b0,\b1,\b2,\b3,13,\mode,14 +.endm + + +.macro aes_rounds blk:req,mode:req + aes_round \blk,0,\mode + aes_round \blk,1,\mode + aes_round \blk,2,\mode + aes_round \blk,3,\mode + aes_round \blk,4,\mode + aes_round \blk,5,\mode + aes_round \blk,6,\mode + aes_round \blk,7,\mode + aes_round \blk,8,\mode + aes_round \blk,9,\mode + aes_round \blk,10,\mode + aes_round \blk,11,\mode + aes_round \blk,12,\mode + aes_round \blk,13,\mode + aes_round \blk,14,\mode +.endm + +/* load k1/k2 from memory and encrypt the tweak by k2 + * boths keys will share the same set of registers + * but will never overlap (k2 is used only once and discarded) + */ +.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req + ldp qKey0,qKey1,[\k2],#32 + aes_enc_round \iv,0 + ldp qKey2,qKey3,[\k2],#32 + aes_enc_round \iv,1 + ldp qKey0,qKey1,[\k1],#32 + aes_enc_round \iv,2 + ldp qKey4,qKey5,[\k2],#32 + aes_enc_round \iv,3 + ldp qKey2,qKey3,[\k1],#32 + aes_enc_round \iv,4 + ldp qKey6,qKey7,[\k2],#32 + aes_enc_round \iv,5 + ldp qKey4,qKey5,[\k1],#32 + aes_enc_round \iv,6 + ldp qKey8,qKey9,[k2],#32 + aes_enc_round \iv,7 + ldp qKey6,qKey7,[\k1],#32 + aes_enc_round \iv,8 + ldp qKey10,qKey11,[k2],#32 + aes_enc_round \iv,9 + ldp qKey8,qKey9,[\k1],#32 + aes_enc_round \iv,10 + ldp qKey12,qKey13,[k2],#32 + aes_enc_round \iv,11 + ldp 
qKey10,qKey11,[\k1],#32 + aes_enc_round \iv,12 + ld1 {vKey14.16b},[k2],#16 + aes_enc_round \iv,13 + ldp qKey12,qKey13,[\k1],#32 + aes_enc_round \iv,14 + ld1 {vKey14.16b},[\k1],#16 +.endm + +.macro save_stack + stp d8,d9,[sp, -48]! + stp d10,d11,[sp, 16] + add tmpbuf,sp,32 +.endm + +.macro restore_stack + ldp d10,d11,[sp, 16] + ldp d8,d9,[sp], 48 +.endm + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S new file mode 100644 index 000000000..aa46ded08 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S @@ -0,0 +1,116 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
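The AES-256 form of aes_key_expand_next above emits two round keys per step, mirroring the AES-256 schedule: the first new key applies RotWord + SubWord + Rcon to the last word of the previous key, the second applies SubWord only. A byte-wise C sketch of one such step, reusing an sbox[] table like the one in the AES-128 sketch earlier:

#include <stdint.h>

extern const uint8_t sbox[256];	/* standard AES S-box */

/* Derive round keys n and n+1 from round keys n-2 and n-1 (n even, n >= 2). */
static void aes256_key_expand_step(const uint8_t km2[16], const uint8_t km1[16],
				   uint8_t out0[16], uint8_t out1[16], uint8_t rcon)
{
	uint8_t t[4];

	/* out0: RotWord + SubWord + Rcon on the last word of km1. */
	t[0] = sbox[km1[13]] ^ rcon;
	t[1] = sbox[km1[14]];
	t[2] = sbox[km1[15]];
	t[3] = sbox[km1[12]];
	for (int i = 0; i < 4; i++) {
		out0[i]      = km2[i]      ^ t[i];
		out0[i + 4]  = km2[i + 4]  ^ out0[i];
		out0[i + 8]  = km2[i + 8]  ^ out0[i + 4];
		out0[i + 12] = km2[i + 12] ^ out0[i + 8];
	}

	/* out1: SubWord only (no rotate, no Rcon) on the last word of out0. */
	t[0] = sbox[out0[12]];
	t[1] = sbox[out0[13]];
	t[2] = sbox[out0[14]];
	t[3] = sbox[out0[15]];
	for (int i = 0; i < 4; i++) {
		out1[i]      = km1[i]      ^ t[i];
		out1[i + 4]  = km1[i + 4]  ^ out1[i];
		out1[i + 8]  = km1[i + 8]  ^ out1[i + 4];
		out1[i + 12] = km1[i + 12] ^ out1[i + 8];
	}
}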
+**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +.macro vswap vec1:req,vec2:req + mov vtmp.16b,\vec1\().16b + mov \vec1\().16b,\vec2\().16b + mov \vec2\().16b,vtmp.16b +.endm + +/* encrypt the tweak by tweak key (k2), and at the same time + * to expand encryption key (k1) + * even though two sets of keys share the same set of registers + * they never overlap at any given time (k2 is used only once and discarded) + */ +.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req + aes_key_expand 0,\k2 + aes_enc_round \iv,0 + aes_enc_round \iv,1 + aes_key_expand 2 + aes_key_expand 0,\k1,rcon2 + aes_enc_round \iv,2 + aes_enc_round \iv,3 + aes_key_expand 4 + aes_key_expand 2,rcon2 + aes_enc_round \iv,4 + aes_enc_round \iv,5 + aes_key_expand 6 + aes_key_expand 4,rcon2 + aes_enc_round \iv,6 + aes_enc_round \iv,7 + aes_key_expand 8 + aes_key_expand 6,rcon2 + aes_enc_round \iv,8 + aes_enc_round \iv,9 + aes_key_expand 10 + aes_key_expand 8,rcon2 + aes_enc_round \iv,10 + aes_enc_round \iv,11 + aes_key_expand 12 + aes_key_expand 10,rcon2 + aes_enc_round \iv,12 + aes_enc_round \iv,13 + aes_key_expand 14 + aes_key_expand 12,rcon2 + aes_enc_round \iv,14 + aes_key_expand 14,rcon2 + + // transform encryption key into decrption key + aesimc vKey1.16b,vKey1.16b + vswap vKey0,vKey14 + aesimc vKey13.16b,vKey13.16b + aesimc vKey2.16b,vKey2.16b + vswap vKey1,vKey13 + aesimc vKey12.16b,vKey12.16b + aesimc vKey3.16b,vKey3.16b + vswap vKey2,vKey12 + aesimc vKey11.16b,vKey11.16b + aesimc vKey4.16b,vKey4.16b + vswap vKey3,vKey11 + aesimc vKey10.16b,vKey10.16b + aesimc vKey5.16b,vKey5.16b + vswap vKey4,vKey10 + aesimc vKey9.16b,vKey9.16b + aesimc vKey6.16b,vKey6.16b + vswap vKey5,vKey9 + aesimc vKey8.16b,vKey8.16b + aesimc vKey7.16b,vKey7.16b + vswap vKey6,vKey8 +.endm + +/* + * void XTS_AES_256_dec_ce( + * uint8_t *k2, //!< key used for tweaking, 32 bytes + * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 32 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); +*/ + .global XTS_AES_256_dec_ce + .type XTS_AES_256_dec_ce, %function +XTS_AES_256_dec_ce: + xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1 + .size XTS_AES_256_dec_ce, .-XTS_AES_256_dec_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S new file mode 100644 index 000000000..8e4088a4d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S @@ -0,0 +1,88 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +/* encrypt the tweak by tweak key (k2), and at the same time + * to expand encryption key (k1) + * even though two sets of keys share the same set of registers + * they never overlap at any given time (k2 is used once and discarded) + */ +.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req + aes_key_expand 0,\k2 + aes_enc_round \iv,0 + aes_enc_round \iv,1 + aes_key_expand 2 + aes_key_expand 0,\k1,rcon2 + aes_enc_round \iv,2 + aes_enc_round \iv,3 + aes_key_expand 4 + aes_key_expand 2,rcon2 + aes_enc_round \iv,4 + aes_enc_round \iv,5 + aes_key_expand 6 + aes_key_expand 4,rcon2 + aes_enc_round \iv,6 + aes_enc_round \iv,7 + aes_key_expand 8 + aes_key_expand 6,rcon2 + aes_enc_round \iv,8 + aes_enc_round \iv,9 + aes_key_expand 10 + aes_key_expand 8,rcon2 + aes_enc_round \iv,10 + aes_enc_round \iv,11 + aes_key_expand 12 + aes_key_expand 10,rcon2 + aes_enc_round \iv,12 + aes_enc_round \iv,13 + aes_key_expand 14 + aes_key_expand 12,rcon2 + aes_enc_round \iv,14 + aes_key_expand 14,rcon2 +.endm + +/* + * void XTS_AES_256_enc_ce( + * uint8_t *k2, //!< key used for tweaking, 16 bytes + * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< cleartext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + */ + .global XTS_AES_256_enc_ce + .type XTS_AES_256_enc_ce, %function +XTS_AES_256_enc_ce: + xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1 + .size XTS_AES_256_enc_ce, .-XTS_AES_256_enc_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S new file mode 100644 index 000000000..c32a13820 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S @@ -0,0 +1,232 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +.macro declare_var_vector_reg name:req,reg:req +.ifdef q\name + .unreq q\name + .unreq v\name + .unreq s\name + .unreq d\name +.endif + .set q\name , \reg + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg + d\name .req d\reg +.endm + +.macro declare_var_generic_reg name:req,reg:req + \name .req x\reg + x\name .req x\reg + w\name .req w\reg +.endm + + declare_var_vector_reg zero ,0 + declare_var_vector_reg tmp,1 + declare_var_vector_reg mask,2 + declare_var_vector_reg dest,3 + declare_var_vector_reg blk0,4 + declare_var_vector_reg blk1,5 + declare_var_vector_reg blk2,6 + declare_var_vector_reg blk3,7 + declare_var_vector_reg Key11,8 + declare_var_vector_reg Key12,9 + declare_var_vector_reg Key13,10 + declare_var_vector_reg Key14,11 + declare_var_vector_reg SavedIv,16 + declare_var_vector_reg IV0,17 + declare_var_vector_reg IV1,18 + declare_var_vector_reg IV2,19 + declare_var_vector_reg IV3,20 + declare_var_vector_reg Key0,21 + declare_var_vector_reg Key1,22 + declare_var_vector_reg Key2,23 + declare_var_vector_reg Key3,24 + declare_var_vector_reg Key4,25 + declare_var_vector_reg Key5,26 + declare_var_vector_reg Key6,27 + declare_var_vector_reg Key7,28 + declare_var_vector_reg Key8,29 + declare_var_vector_reg Key9,30 + declare_var_vector_reg Key10,31 + +.macro aes_enc_round block:req,key:req + aes_round \block,\key,0 +.endm + +.macro aes_dec_round block:req,key:req + aes_round \block,\key,1 +.endm + +.macro update_iv current:req,next:req + mov ivh,\current\().d[1] + mov ivl,\current\().d[0] + mov tmpw,#0x87 + extr tmpx2,ivh,ivh,#32 + extr ivh,ivh,ivl,#63 + and tmpw,tmpw,tmpw2,asr#31 + eor ivl,tmpx,ivl,lsl#1 + mov \next\().d[1],ivh + mov \next\().d[0],ivl +.endm + +.macro process_4_blks inp:req,outp:req,mode:req,is_tail + update_iv vIV0,vIV1 + update_iv vIV1,vIV2 + ldp qblk0,qblk1,[\inp],#32 + ldp qblk2,qblk3,[\inp],#32 + .ifnb \is_tail + update_iv vIV2, vSavedIv + update_iv vSavedIv,vIV3 + .else + update_iv vIV2,vIV3 + .endif + eor vblk0.16b,vblk0.16b,vIV0.16b + eor vblk1.16b,vblk1.16b,vIV1.16b + eor vblk2.16b,vblk2.16b,vIV2.16b + eor vblk3.16b,vblk3.16b,vIV3.16b + + aes_rounds_interleave vblk0,vblk1,vblk2,vblk3,\mode + eor vblk0.16b,vblk0.16b,vIV0.16b + eor vblk1.16b,vblk1.16b,vIV1.16b + stp qblk0,qblk1,[\outp],#32 + eor 
vblk2.16b,vblk2.16b,vIV2.16b + eor vblk3.16b,vblk3.16b,vIV3.16b + stp qblk2,qblk3,[\outp],#32 + .ifb \is_tail + update_iv vIV3,vIV0 + .endif +.endm + +.macro process_1_blk inp:req,outp:req,mode:req + ld1 {vblk0.16b},[\inp],#16 + eor vblk0.16b,vblk0.16b,vIV0.16b + aes_rounds vblk0,\mode + eor vblk0.16b,vblk0.16b,vIV0.16b + str qblk0,[\outp], #16 +.endm + + key2 .req x0 + key1 .req x1 + iv .req x2 + bytes .req x3 + inp .req x4 + outp .req x5 + rcon .req w6 + blocks .req x7 + tmpx .req x8 + tmpw .req w8 + tmpw2 .req w9 + tmpx2 .req x9 + ivl .req x10 + ivh .req x11 + lastblk .req x12 + tmpbuf .req x13 + tailcnt .req x14 + rcon2 .req w15 + +.macro xts_aes_crypt mode:req,expander,more:vararg + save_stack + + ld1 {vIV0.16b},[iv],16 + .ifnb \expander + \expander\() \more + .endif + lsr blocks,bytes,4 + and tailcnt,bytes,#0x0F + + cmp bytes,16 + b.lt .return + +.process_4_blks: + cmp blocks, 4 + b.lt .singles + subs blocks,blocks,4 + /* in decryption mode, check whether this is + * last block before the less-than-one-block tail + * need to swap tweak in this case + */ + .if \mode == 1 + b.gt .not_tail_4blk + cmp tailcnt,1 + b.lt .not_tail_4blk + process_4_blks inp,outp,\mode,1 + b .process_4_blks +.not_tail_4blk: + .endif + process_4_blks inp,outp,\mode + b .process_4_blks + +.singles: + subs blocks,blocks,#1 + b.lt .checktail + /* in decryption mode, check whether this is + *last block before the less-than-one-block tail + * need to swap tweak in this case + */ + .if \mode == 1 + b.gt .not_tail_1blk + cmp tailcnt,1 + b.lt .not_tail_1blk + mov vSavedIv.16b, vIV0.16b + update_iv vSavedIv, vIV0 + process_1_blk inp,outp,\mode + b .checktail +.not_tail_1blk: + .endif + process_1_blk inp,outp,\mode + update_iv vIV0,vIV0 + b .singles +.checktail: + cmp tailcnt,1 + b.lt .return + sub lastblk,outp,#16 +.copytail: + subs tailcnt,tailcnt,#1 + ldrb tmpw,[lastblk,tailcnt] + strb tmpw,[outp,tailcnt] + ldrb tmpw,[inp,tailcnt] + strb tmpw,[tmpbuf,tailcnt] + b.gt .copytail + and tailcnt,bytes,#0x0F +.steal: + cmp tailcnt,15 + ldrb tmpw,[lastblk,tailcnt] + strb tmpw,[tmpbuf,tailcnt] + add tailcnt,tailcnt,#1 + b.lt .steal + .if \mode == 1 + mov vIV0.16b,vSavedIv.16b + .endif + process_1_blk tmpbuf,lastblk,\mode +.return: + restore_stack + ret +.endm + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S new file mode 100644 index 000000000..9549ebfa0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
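The update_iv macro above advances the XTS tweak from one block to the next: the 128-bit value is shifted left by one and, if a carry falls out of bit 127, the constant 0x87 is folded back in. That is multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, the alpha step of IEEE P1619. The same operation in C, with the tweak kept as two little-endian 64-bit halves exactly as in the ivl/ivh registers:

#include <stdint.h>

/* Multiply the tweak by x in GF(2^128); tw[0] is the low half, tw[1] the high. */
static void xts_mult_x(uint64_t tw[2])
{
	uint64_t carry = tw[1] >> 63;		/* bit 127 */

	tw[1] = (tw[1] << 1) | (tw[0] >> 63);
	tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0);
}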
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_128_dec_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used + * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); +*/ + .global XTS_AES_128_dec_expanded_key_ce + .type XTS_AES_128_dec_expanded_key_ce, %function +XTS_AES_128_dec_expanded_key_ce: + xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_128_dec_expanded_key_ce, .-XTS_AES_128_dec_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S new file mode 100644 index 000000000..1f2d2db2e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
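The .checktail/.copytail/.steal path of xts_aes_crypt above implements XTS ciphertext stealing for a final partial block: the tail of the output is copied from the last full ciphertext block already written, and that block is then recomputed from the tail plaintext padded with the "stolen" ciphertext bytes (for decryption the two tweaks are swapped via vSavedIv). A C sketch of the encrypt direction; xts_encrypt_block is a hypothetical callback for one block under the final tweak, not a library function:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* prev_ct: last full ciphertext block already in the output (will be rewritten).
 * tail_pt: the t remaining plaintext bytes, 1 <= t < 16.
 * tail_ct_out: where the t tail ciphertext bytes go (right after prev_ct). */
static void xts_steal_tail(uint8_t prev_ct[16], const uint8_t *tail_pt, size_t t,
			   uint8_t *tail_ct_out,
			   void (*xts_encrypt_block)(const uint8_t in[16], uint8_t out[16]))
{
	uint8_t buf[16];

	memcpy(tail_ct_out, prev_ct, t);	/* C_m = first t bytes of C_{m-1} */
	memcpy(buf, tail_pt, t);		/* P_m ...                        */
	memcpy(buf + t, prev_ct + t, 16 - t);	/* ... padded with stolen bytes   */
	xts_encrypt_block(buf, prev_ct);	/* new C_{m-1} overwrites the old */
}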
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_128_enc_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes + * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< plaintext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + */ + .global XTS_AES_128_enc_expanded_key_ce + .type XTS_AES_128_enc_expanded_key_ce, %function +XTS_AES_128_enc_expanded_key_ce: + xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_128_enc_expanded_key_ce, .-XTS_AES_128_enc_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S new file mode 100644 index 000000000..95c8bf63d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
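The *_expanded_key_ce variants above skip the on-the-fly schedule and expect both keys pre-expanded (16*11 bytes each for AES-128). A usage sketch that expands the keys once and reuses them for many sectors; the aes_keyexp_128 prototype shown is an assumption about the library's key-expansion API, and the 512-byte sector size and little-endian tweak encoding are only examples:

#include <stdint.h>
#include <string.h>

void aes_keyexp_128(const uint8_t *key, uint8_t *exp_key_enc, uint8_t *exp_key_dec);
void XTS_AES_128_enc_expanded_key_ce(uint8_t *k2, uint8_t *k1, uint8_t *tw_initial,
				     uint64_t n, const uint8_t *pt, uint8_t *ct);

static void encrypt_sectors(const uint8_t key1[16], const uint8_t key2[16],
			    const uint8_t *in, uint8_t *out, uint64_t nsectors)
{
	uint8_t ek1_enc[16 * 11], ek1_dec[16 * 11];	/* data key schedules  */
	uint8_t ek2_enc[16 * 11], ek2_dec[16 * 11];	/* tweak key schedules */
	uint8_t tweak[16] = { 0 };

	aes_keyexp_128(key1, ek1_enc, ek1_dec);
	aes_keyexp_128(key2, ek2_enc, ek2_dec);

	for (uint64_t s = 0; s < nsectors; s++) {
		memcpy(tweak, &s, sizeof(s));	/* sector number as tweak; assumes LE host */
		XTS_AES_128_enc_expanded_key_ce(ek2_enc, ek1_enc, tweak, 512,
						in + 512 * s, out + 512 * s);
	}
}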
+**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_256_dec_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used + * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); +*/ + .global XTS_AES_256_dec_expanded_key_ce + .type XTS_AES_256_dec_expanded_key_ce, %function +XTS_AES_256_dec_expanded_key_ce: + xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_256_dec_expanded_key_ce, .-XTS_AES_256_dec_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S new file mode 100644 index 000000000..bd840a994 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_256_enc_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes + * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< plaintext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + */ + .global XTS_AES_256_enc_expanded_key_ce + .type XTS_AES_256_enc_expanded_key_ce, %function +XTS_AES_256_enc_expanded_key_ce: + xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_256_enc_expanded_key_ce, .-XTS_AES_256_enc_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S new file mode 100644 index 000000000..af77d885b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S @@ -0,0 +1,39 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface XTS_AES_128_enc +mbin_interface XTS_AES_128_dec +mbin_interface XTS_AES_128_enc_expanded_key +mbin_interface XTS_AES_128_dec_expanded_key +mbin_interface XTS_AES_256_enc +mbin_interface XTS_AES_256_dec +mbin_interface XTS_AES_256_enc_expanded_key +mbin_interface XTS_AES_256_dec_expanded_key diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm new file mode 100644 index 000000000..22f00b395 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm @@ -0,0 +1,377 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _AES_COMMON_ASM_ +%define _AES_COMMON_ASM_ + +%include "reg_sizes.asm" + +;; ============================================================================= +;; Generic macro to produce code that executes %%OPCODE instruction +;; on selected number of AES blocks (16 bytes long ) between 0 and 16. +;; All three operands of the instruction come from registers. 
+;; Note: if 3 blocks are left at the end instruction is produced to operate all +;; 4 blocks (full width of ZMM) + +%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OPCODE %2 ; [in] instruction name +%define %%DST0 %3 ; [out] destination ZMM register +%define %%DST1 %4 ; [out] destination ZMM register +%define %%DST2 %5 ; [out] destination ZMM register +%define %%DST3 %6 ; [out] destination ZMM register +%define %%SRC1_0 %7 ; [in] source 1 ZMM register +%define %%SRC1_1 %8 ; [in] source 1 ZMM register +%define %%SRC1_2 %9 ; [in] source 1 ZMM register +%define %%SRC1_3 %10 ; [in] source 1 ZMM register +%define %%SRC2_0 %11 ; [in] source 2 ZMM register +%define %%SRC2_1 %12 ; [in] source 2 ZMM register +%define %%SRC2_2 %13 ; [in] source 2 ZMM register +%define %%SRC2_3 %14 ; [in] source 2 ZMM register + +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%undef %%DSTREG +%undef %%SRC1REG +%undef %%SRC2REG +%assign reg_idx (reg_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep + +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + +%if blocks_left == 1 + %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG) +%elif blocks_left == 2 + %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG) +%elif blocks_left == 3 + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers +;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left +;; - if %%FlAGS not provided then exactly 3 blocks are loaded (move and insert) +;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded +%macro ZMM_LOAD_BLOCKS_0_16 7-8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3" + +%assign src_offset 0 +%assign dst_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 3 +%ifidn %%FLAGS, load_4_instead_of_3 + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%else + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] + vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2 +%endif +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers using mask register +;; for the last 
loaded register (xmm, ymm or zmm). +;; Loads take place at 1 byte granularity. +%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%MASK %8 ; [in] mask register + +%assign src_offset 0 +%assign dst_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers +%macro ZMM_STORE_BLOCKS_0_16 7 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store + +%assign dst_offset 0 +%assign src_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) +%elif blocks_left == 3 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) + vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2 +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers with mask register +;; for the last loaded register (xmm, ymm or zmm). +;; Stores take place at 1 byte granularity. 
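+;;
+;; Illustrative usage sketch (an editorial addition, not part of the upstream
+;; macro set): callers of the masked load/store macros typically derive the
+;; byte mask from the number of bytes left for the final (partial-width)
+;; register. Register names below are hypothetical placeholders:
+;;
+;;      xor     tmp, tmp
+;;      bts     tmp, len_tail            ; len_tail = bytes left in last register (1-63)
+;;      dec     tmp                      ; tmp = (1 << len_tail) - 1
+;;      kmovq   k1, tmp                  ; one mask bit per byte to be touched
+;;      ZMM_LOAD_MASKED_BLOCKS_0_16  7, p_in,  0, zmm1, zmm2, zmm3, zmm4, k1
+;;      ZMM_STORE_MASKED_BLOCKS_0_16 7, p_out, 0, zmm1, zmm2, zmm3, zmm4, k1
+;;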
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store +%define %%MASK %8 ; [in] mask register + +%assign dst_offset 0 +%assign src_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG) +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG +%endif + +%endmacro + +;;; =========================================================================== +;;; Handles AES encryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameterto check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, 
%%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +;;; =========================================================================== +;;; Handles AES decryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameter to check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +%endif ;; _AES_COMMON_ASM diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm new file mode 100644 index 000000000..2a879abdd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm @@ -0,0 +1,431 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; the following defines control the operation of the macros below and +; need to be defines in the including file +; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15 +; EARLY_BLOCKS - number of data block to load before starting computations +; PARALLEL_BLOCKS - number of blocks of data to process in parallel also the number of xmm regs to reserve for data +; IV_CNT - number of xmm regs to use for IV data valid values of 0 or 1 +; TMP_CNT - number of tmp xmm register to reserve +; XMM_USAGE - number of xmm registers to use. 
must be at least the same as PARALLEL_BLOCKS + 2 +; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; +; the following instructions set specific macros must be defined in the user file +; to make use of the AES macros below +; MOVDQ - move from memory to xmm reg +; PXOR - XOR of two xmm registers pxor +; AES_DEC - AES block decode for early key rounds +; AES_DEC_LAST - AES block decode for last key round +; or +; AES_ENC - AES block encode for early key rounds +; AES_ENC_LAST - AES block encode for last key round + +; Three usages of xmm regs: key round cache, blocks data and one temp +; CKEY_CNT are (number of xmm regs) - PARALLEL_BLOCKS - IV holder - 2 TMP mmx reg +%assign FIRST_XDATA (0) +%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS) +%ifndef IV_CNT +%define IV_CNT (1) +%endif +%assign TMP (IV_IDX + IV_CNT) +%assign TMP_CNT (2) +%assign FIRST_CKEY (TMP + TMP_CNT) +%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT)) + +; Abstract xmm register usages that identify the expected contents of the register +%define reg(i) xmm %+ i +%define XDATA(i) xmm %+ i +%define KEY_REG(i) xmm %+ i +%define IV_REG(i) xmm %+ i + +%define IDX rax + + + + +; +; +; AES CBC ENCODE MACROS +; +; + +; +; CBC_DECRYPT_BLOCKS +; Decrypts a number of blocks using AES_PARALLEL_ENC_BLOCKS macro +; Finalized the decryption and saves results in the output +; places last last buffers crypto text in IV for next buffer +; updates the index and number of bytes left +; +%macro CBC_DECRYPT_BLOCKS 17 +%define %%TOT_ROUNDS %1 +%define %%num_blocks %2 ; can be 0..13 +%define %%EARLY_LOADS %3 ; number of data blocks to laod before processing +%define %%MOVDQ %4 +%define %%PXOR %5 +%define %%AES_DEC %6 +%define %%AES_DEC_LAST %7 +%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs +%define %%TMP %9 +%define %%TMP_CNT %10 +%define %%FIRST_CKEY %11 +%define %%KEY_DATA %12 +%define %%FIRST_XDATA %13 +%define %%IN %14 ; input data +%define %%OUT %15 ; output data +%define %%IDX %16 ; index into input and output data buffers +%define %%LEN %17 + + AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX + + ; + ; XOR the result of each block's decrypt with the previous block's cypher text (C) + ; + %assign i 0 + %rep (%%num_blocks) + %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C + %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out + %%MOVDQ XDATA(IV_IDX), [%%IN + IDX + i*16] ; load IV with current block C + %assign i (i+1) + %endrep + + add %%IDX, %%num_blocks*16 + sub %%LEN, %%num_blocks*16 +%endmacro + + +; +; CBC_ENC_INIT +; XOR first data block with the IV data +%macro CBC_ENC_INIT 7 +%define %%P_FIRST %1 +%define %%IV_IDX %2 +%define %%MOVDQ %3 +%define %%PXOR %4 +%define %%IV %5 +%define %%IN %6 ; input data +%define %%IDX %7 ; index into input and output data buffers + + %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16] + %%MOVDQ reg(%%IV_IDX), [%%IV] + %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX) +%endmacro + +; +; assumptions: +; LEN is length of data remaining +; IDX is offset into the data buffer +; +; subloops +; if data > 16 load next block into a next XDATA reg (XDATA(p_next)) +; load first uncached key into TMP0 (if any) +; AES block encript XDATA(P_FIRST) +; if data > 16 XOR next2 block (XDATA(p_next)) with current (XDATA(P_FIRST)) +; save current (XDATA(P_FIRST)) +; update indexes for P_FIRST +; 
end if data zero +; +%macro CBC_ENC_SUBLOOP 17 +%define %%TOT_ROUNDS %1 +%define %%BLOCKS %2 ; can be 1...14 +%define %%START_DATA %3 +%define %%MOVDQ %4 +%define %%PXOR %5 +%define %%AES_DEC %6 +%define %%AES_DEC_LAST %7 +%define %%TMP %8 +%define %%TMP_CNT %9 +%define %%FIRST_CKEY %10 +%define %%CKEY_CNT %11 +%define %%KEYS %12 +%define %%CACHED_KEYS %13 +%define %%IN %14 ; input data +%define %%OUT %15 ; output data +%define %%IDX %16 ; index into input and output data buffers +%define %%LEN %17 + + %assign this_blk 0 + %assign next_blk 1 + %assign p_first %%START_DATA + %assign p_next (p_first+1) + ; for number of blocks to be processed in a loop + %assign blk 1 + %rep %%BLOCKS + ; if data > 16 load next block into a next XDATA reg (XDATA(p_next)) + cmp %%LEN, 16 + %push skip_read + je %$skip_read_next + %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16] + %$skip_read_next: + %pop + + AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST + + ; if data > 16 XOR next2 block (XDATA(p_next)) with current (XDATA(p_first)) + cmp %%LEN, 16 + %push skip_next + je %$skip_next_blk_start + %%PXOR XDATA(p_next), XDATA(p_first) + %$skip_next_blk_start: + %pop + + ; save current (XDATA(p_first)) + %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first) + ; update indexes for p_first + add %%IDX, 16 + sub %%LEN, 16 + + %if (blk < %%BLOCKS) ; only insert jz if NOT last block + ; end if data zero + jz %%END_CBC_ENC_SUBLOOP + %endif ; (p_next < %%BLOCKS) + + %assign p_first (p_next) + %assign blk (blk+1) + %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA + %assign p_next (%%START_DATA) + %elif (1 == %%BLOCKS) + %%MOVDQ XDATA(%%START_DATA), XDATA(p_next) + %else + %assign p_next (p_next+1) + %endif + %endrep ; %%BLOCKS + + %%END_CBC_ENC_SUBLOOP: +%endm ; CBC_ENC_SUBLOOP + + +; +; +; AES BLOCK ENCODE MACROS +; +; + +; +; FILL_KEY_CACHE +; Load key data into the cache key xmm regs +%macro FILL_KEY_CACHE 4 +%define %%CACHED_KEYS %1 +%define %%CKEY_START %2 +%define %%KEY_DATA %3 +%define %%MOVDQ %4 + + %assign rnd 0 + %rep KEY_ROUNDS + %if (rnd < %%CACHED_KEYS) ; find the round's key data + %assign c (rnd + %%CKEY_START) + %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register + %endif + %assign rnd (rnd+1) + %endrep +%endmacro + +; +; SCHEDULE_DATA_LOAD +; pre-loades message data into xmm regs +; updates global 'blocks_loaded' that tracks which data blocks have been loaded +; 'blocks_loaded' is an in/out global and must be declared in the using macro or function +%macro SCHEDULE_DATA_LOAD 5 +%define %%PARALLEL_DATA %1 +%define %%EARLY_LOADS %2 +%define %%MOVDQ %3 +%define %%IN %4 +%define %%IDX %5 + + %if (blocks_loaded < %%PARALLEL_DATA) + ; load cipher text + %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16] + %assign blocks_loaded (blocks_loaded+1) + %endif ; (blocks_loaded < %%PARALLEL_DATA) +%endmacro ; SCHEDULED_EARLY_DATA_LOADS + +; +; INIT_SELECT_KEY +; determine which xmm reg holds the key data needed or loades it into the temp register if not cached +; 'current_tmp' is an in/out global and must be declared in the using macro or function +%macro INIT_SELECT_KEY 6 +%define %%TOT_ROUNDS %1 +%define %%CACHED_KEYS %2 +%define %%KEY_DATA %3 +%define %%FIRST_TMP %4 +%define %%TMP_CNT %5 +%define %%MOVDQ %6 + + %assign current_tmp (%%FIRST_TMP) + %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg + %%MOVDQ 
KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16] + %endif ; (KEY_ROUNDS > CKEY_CNT) +%endmacro ; SELECT_KEY + +; +; SELECT_KEY +; determine which xmm reg holds the key data needed or loades it into the temp register if not cached +; 'current_tmp' is an in/out global and must be declared in the using macro or function +%macro SELECT_KEY 8 +%define %%ROUND %1 +%define %%TOT_ROUNDS %2 +%define %%CACHED_KEYS %3 +%define %%FIRST_KEY %4 +%define %%KEY_DATA %5 +%define %%FIRST_TMP %6 +%define %%TMP_CNT %7 +%define %%MOVDQ %8 + + ; find the key data for this round + %if (%%ROUND < %%CACHED_KEYS) ; is it cached + %assign key (%%ROUND + %%FIRST_KEY) + %else + ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one + %assign key (current_tmp) ; use the previous loaded key data + %if (1 == %%TMP_CNT) + %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load the next rounds key data + %else + %assign next_round (%%ROUND+1) + %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done + %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use + %assign current_tmp (current_tmp + 1) + %else + %assign current_tmp (%%FIRST_TMP) + %endif ; (current_tmp == %%FIRST_TMP) + %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; load the next rounds key data + + %endif ; (%%ROUND < KEY_ROUNDS) + %endif ; (1 < %%TMP_CNT) + %endif ; (%%ROUND < %%CACHED_KEYS) +%endmacro ; SELECT_KEY + + +; +; AES_PARALLEL_ENC_BLOCKS +; preloads some data blocks to be worked on +; starts the aes block encoding while loading the other blocks to be done in parallel +; aes block encodes each key round on each block +%macro AES_PARALLEL_ENC_BLOCKS 16 +%define %%KEY_ROUNDS %1 +%define %%PARALLEL_DATA %2 +%define %%EARLY_LOADS %3 +%define %%MOVDQ %4 +%define %%PXOR %5 +%define %%AES_DEC %6 +%define %%AES_DEC_LAST %7 +%define %%CACHED_KEYS %8 +%define %%TMP %9 +%define %%TMP_CNT %10 +%define %%FIRST_CKEY %11 +%define %%KEY_DATA %12 +%define %%FIRST_XDATA %13 +%define %%IN %14 ; input data +%define %%OUT %15 ; output data +%define %%IDX %16 ; index into input and output data buffers + + %assign blocks_loaded 0 + + %rep %%EARLY_LOADS + SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded + %endrep ; %%EARLY_LOADS + + %assign current_tmp (TMP) + INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ + + %assign round 0 + %assign key 0 + %rep KEY_ROUNDS ; for all key rounds + SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ + + %assign i %%FIRST_XDATA + %rep %%PARALLEL_DATA ; for each block do the EAS block encode step + %if (0 == round) + %%PXOR XDATA(i), KEY_REG(key) ; first round's step + SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX + + %elif ( (%%KEY_ROUNDS-1) == round ) + %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step + + %else + %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step + + %endif + %assign i (i+1) + %endrep ;%%PARALLEL_DATA + %assign round (round+1) + %endrep ;KEY_ROUNDS +%endmacro ; AES_PARALLEL_ENC_BLOCKS + + + +; +; AES_ENC_BLOCKS +; load first uncached key into TMP0 (if any) +; AES block encript XDATA(p_first) +; before using uncached key in TMP0, load next key in TMP1 +; before using uncached key in TMP1, load next key in TMP0 +%macro AES_ENC_BLOCKS 11 +%define %%TOT_ROUNDS %1 +%define %%ENC_BLOCK %2 +%define %%TMP %3 +%define %%TMP_CNT %4 +%define %%FIRST_CKEY %5 +%define %%CACHED_KEYS %6 +%define 
%%KEY_DATA %7 +%define %%MOVDQ %8 +%define %%PXOR %9 +%define %%AES_ENC %10 +%define %%AES_ENC_LAST %11 + + %assign current_tmp (%%TMP) + INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ + + %assign round 0 + %assign key (round + %%FIRST_CKEY) + %rep %%TOT_ROUNDS ; for all key rounds + ; find the key data for this round + SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ + + ; encrypt block + %if (0 == round) + %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step + %elif ( (%%TOT_ROUNDS-1) == round ) + %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step + %else + %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step + %endif ; (0 == round) + + %assign round (round+1) + %endrep ; KEY_ROUNDS +%endmacro ; AES_ENC + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm new file mode 100644 index 000000000..68aa227ca --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm @@ -0,0 +1,162 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_cbc_dec_128_sse(void *in, +; uint8_t *IV, +; uint8_t keys, +; void *out, +; uint64_t len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro + +%endif + +; configuration paramaters for AES-CBC macros +%define KEY_ROUNDS 11 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (2) +%define PARALLEL_BLOCKS (8) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_DEC aesdec +%define AES_DEC_LAST aesdeclast +%include "cbc_common.asm" + +section .text + +align 16 +mk_global aes_cbc_dec_128_sse, function +func(aes_cbc_dec_128_sse) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm new file mode 100644 index 000000000..d4b6dfb2a --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm @@ -0,0 +1,162 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; routine to do AES128 CBC decrypt +;; clobbers xmm0-15 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro + +%endif + +; configuration paramaters for AES-CBC +%define KEY_ROUNDS 11 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (4) +%define PARALLEL_BLOCKS (11) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_DEC 2 + 
vaesdec %1, %1, %2 +%endm + +%macro AES_DEC_LAST 2 + vaesdeclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + +section .text + +;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +mk_global aes_cbc_dec_128_avx, function +func(aes_cbc_dec_128_avx) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm new file mode 100644 index 000000000..4b017d193 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm @@ -0,0 +1,164 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_cbc_dec_192_sse(void *in, +; uint8_t *IV, +; uint8_t keys[13], // +1 over key length +; void *out, +; uint64_t len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; + +%include "reg_sizes.asm" + +%define MOVDQ movdqu + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro + +%endif + +; configuration paramaters for AES-CBC +%define KEY_ROUNDS 13 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (2) +%define PARALLEL_BLOCKS (5) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_DEC aesdec +%define AES_DEC_LAST aesdeclast + +%include "cbc_common.asm" + +section .text + +mk_global aes_cbc_dec_192_sse, function +func(aes_cbc_dec_192_sse) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials +done: + FUNC_RESTORE + ret + +endproc_frame diff --git 
a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm new file mode 100644 index 000000000..2791570ad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm @@ -0,0 +1,158 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES192 CBC decrypt + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +; configuration paramaters for AES-CBC +%define KEY_ROUNDS 13 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (4) +%define PARALLEL_BLOCKS (11) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_DEC 2 + vaesdec %1, %1, %2 +%endm + +%macro AES_DEC_LAST 2 + vaesdeclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +mk_global aes_cbc_dec_192_avx, function +func(aes_cbc_dec_192_avx) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm 
new file mode 100644 index 000000000..44c76268e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm @@ -0,0 +1,161 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_cbc_dec_256_sse(void *in, +; uint8_t *IV, +; uint8_t keys, +; void *out, +; uint64_t len_bytes); +; +; arg 1: rcx: pointer to input (cipher text) +; arg 2: rdx: pointer to IV +; arg 3: r8: pointer to keys +; arg 4: r9: pointer to output (plain text) +; arg 5: sp: length in bytes (multiple of 16) +; + +%include "reg_sizes.asm" + +%define MOVDQ movdqu + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +; configuration paramaters for AES-CBC +%define KEY_ROUNDS 15 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (4) +%define PARALLEL_BLOCKS (11) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_DEC aesdec +%define AES_DEC_LAST aesdeclast + +%include "cbc_common.asm" + +mk_global aes_cbc_dec_256_sse, function +func(aes_cbc_dec_256_sse) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm new file mode 100644 index 000000000..cad1a6bef --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm @@ -0,0 +1,158 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES256 CBC decrypt + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +; configuration paramaters for AES-CBC +%define KEY_ROUNDS 15 +%define XMM_USAGE (16) +%define EARLY_BLOCKS (4) +%define PARALLEL_BLOCKS (11) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_DEC 2 + vaesdec %1, %1, %2 +%endm + +%macro AES_DEC_LAST 2 + vaesdeclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
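+;;
+;; Added note (editorial, not upstream commentary): once fewer than
+;; PARALLEL_BLOCKS blocks remain, the tail is peeled off in groups of 4, 2 and
+;; 1 blocks by the partials/initial_4/initial_2/initial_1 cascade below. For
+;; example, with 7 blocks (112 bytes) left the flow is: initial_4 decrypts 4
+;; blocks and jumps back to partials, initial_2 decrypts 2 more, and initial_1
+;; finishes the last block before falling through to done.
+;;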
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +mk_global aes_cbc_dec_256_avx, function +func(aes_cbc_dec_256_avx) + endbranch + FUNC_SAVE + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + + MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt + mov IDX, 0 + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; if enough data blocks remain enter main_loop + jmp partials + +main_loop: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + cmp LEN, PARALLEL_BLOCKS*16 + jge main_loop ; enough blocks to do another full parallel set + jz done + +partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1 + cmp LEN, 0 + je done + cmp LEN, 4*16 + jge initial_4 + cmp LEN, 2*16 + jge initial_2 + +initial_1: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jmp done + +initial_2: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jz done + jmp partials + +initial_4: + CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN + jnz partials +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm new file mode 100644 index 000000000..6124e2def --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm @@ -0,0 +1,519 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2019-2021 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "aes_common.asm" +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +[bits 64] +default rel + +%define zIV zmm0 +%define zBLK_0_3 zmm1 +%define zBLK_4_7 zmm2 +%define zBLK_8_11 zmm3 +%define zBLK_12_15 zmm4 +%define zTMP0 zmm5 +%define zTMP1 zmm6 +%define zTMP2 zmm7 +%define zTMP3 zmm8 + +%define ZKEY0 zmm17 +%define ZKEY1 zmm18 +%define ZKEY2 zmm19 +%define ZKEY3 zmm20 +%define ZKEY4 zmm21 +%define ZKEY5 zmm22 +%define ZKEY6 zmm23 +%define ZKEY7 zmm24 +%define ZKEY8 zmm25 +%define ZKEY9 zmm26 +%define ZKEY10 zmm27 +%define ZKEY11 zmm28 +%define ZKEY12 zmm29 +%define ZKEY13 zmm30 +%define ZKEY14 zmm31 + +%ifidn __OUTPUT_FORMAT__, elf64 +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 +%define tmp2 r11 + +%ifdef CBCS +%define OFFSET 160 +%else +%define OFFSET 16 +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; macro to preload keys +;;; - uses ZKEY[0-14] registers (ZMM) +%macro LOAD_KEYS 2 +%define %%KEYS %1 ; [in] key pointer +%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds + ; excluding 1st and last rounds. + ; Example: AES-128 -> value 9 + +%assign i 0 +%rep (%%NROUNDS + 2) + vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i] +%assign i (i + 1) +%endrep + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to "cool down" pipeline after DECRYPT_16_PARALLEL macro +;;; code as the number of final blocks is variable. +;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0) + +%macro FINAL_BLOCKS 14 +%define %%PLAIN_OUT %1 ; [in] output buffer +%define %%CIPH_IN %2 ; [in] input buffer +%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3) +%define %%num_final_blocks %4 ; [in] numerical value (1 - 15) +%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks +%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks +%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks +%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks +%define %%ZT1 %9 ; [clobbered] ZMM temporary +%define %%ZT2 %10 ; [clobbered] ZMM temporary +%define %%ZT3 %11 ; [clobbered] ZMM temporary +%define %%ZT4 %12 ; [clobbered] ZMM temporary +%define %%IA0 %13 ; [clobbered] GP temporary +%define %%NROUNDS %14 ; [in] number of rounds; numerical value + + ;; load plain/cipher text +%ifdef CBCS + ZMM_LOAD_BLOCKS_0_16_OFFSET %%num_final_blocks, %%CIPH_IN, \ + OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%else + ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \ + %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%endif + ;; Prepare final cipher text blocks to + ;; be XOR'd later after AESDEC + valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6 +%if %%num_final_blocks > 4 + valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6 +%endif +%if %%num_final_blocks > 8 + valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6 +%endif +%if %%num_final_blocks > 12 + valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6 +%endif + + ;; Update IV with last cipher block + ;; to be used later in DECRYPT_16_PARALLEL +%if %%num_final_blocks == 1 + valignq %%LAST_CIPH_BLK, 
%%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2 +%elif %%num_final_blocks == 2 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4 +%elif %%num_final_blocks == 3 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6 +%elif %%num_final_blocks == 4 + vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3 +%elif %%num_final_blocks == 5 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2 +%elif %%num_final_blocks == 6 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4 +%elif %%num_final_blocks == 7 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6 +%elif %%num_final_blocks == 8 + vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7 +%elif %%num_final_blocks == 9 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2 +%elif %%num_final_blocks == 10 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4 +%elif %%num_final_blocks == 11 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6 +%elif %%num_final_blocks == 12 + vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11 +%elif %%num_final_blocks == 13 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2 +%elif %%num_final_blocks == 14 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4 +%elif %%num_final_blocks == 15 + valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6 +%endif + + ;; AES rounds +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \ + ZKEY %+ j, j, no_data, no_data, no_data, no_data, \ + %%num_final_blocks, %%NROUNDS +%assign j (j + 1) +%endrep + + ;; XOR with decrypted blocks to get plain text + vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1 +%if %%num_final_blocks > 4 + vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2 +%endif +%if %%num_final_blocks > 8 + vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3 +%endif +%if %%num_final_blocks > 12 + vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4 +%endif + + ;; write plain text back to output +%ifdef CBCS + ZMM_STORE_BLOCKS_0_16_OFFSET %%num_final_blocks, %%PLAIN_OUT, \ + OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%else + ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \ + %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%endif + +%endmacro ; FINAL_BLOCKS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main AES-CBC decrypt macro +;;; - operates on single stream +;;; - decrypts 16 blocks at a time +%macro DECRYPT_16_PARALLEL 14 +%define %%PLAIN_OUT %1 ; [in] output buffer +%define %%CIPH_IN %2 ; [in] input buffer +%define %%LENGTH %3 ; [in/out] number of bytes to process +%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3) +%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks +%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks +%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks +%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks +%define %%ZT1 %9 ; [clobbered] ZMM temporary +%define %%ZT2 %10 ; [clobbered] ZMM temporary +%define %%ZT3 %11 ; [clobbered] ZMM temporary +%define %%ZT4 %12 ; [clobbered] ZMM temporary +%define %%NROUNDS %13 ; [in] number of rounds; numerical value +%define %%IA0 %14 ; [clobbered] GP temporary + +%ifdef CBCS + 
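+        ;; CBCS build: consecutive cipher blocks sit OFFSET (160) bytes apart in the
+        ;; buffer; the default CBC build below loads 16 contiguous 16-byte blocks.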
ZMM_LOAD_BLOCKS_0_16_OFFSET 16, %%CIPH_IN, OFFSET, \ + %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%else + vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN] + vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64] + vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128] + vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192] +%endif + ;; prepare first set of cipher blocks for later XOR'ing + valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6 + valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6 + valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6 + valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6 + + ;; store last cipher text block to be used for next 16 blocks + vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15 + + ;; AES rounds +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \ + ZKEY %+ j, j, no_data, no_data, no_data, no_data, \ + 16, %%NROUNDS +%assign j (j + 1) +%endrep + + ;; XOR with decrypted blocks to get plain text + vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1 + vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2 + vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3 + vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4 + + ;; write plain text back to output +%ifdef CBCS + ZMM_STORE_BLOCKS_0_16_OFFSET 16, %%PLAIN_OUT, OFFSET, \ + %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \ + %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15 +%else + vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3 + vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7 + vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11 + vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15 +%endif + ;; adjust input pointer and length + sub %%LENGTH, (16 * 16) + add %%CIPH_IN, (16 * OFFSET) + add %%PLAIN_OUT, (16 * OFFSET) + +%endmacro ; DECRYPT_16_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; AES_CBC_DEC macro decrypts given data. 
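+;;; LENGTH is consumed 256 bytes (16 blocks) at a time by DECRYPT_16_PARALLEL;
+;;; the remaining (LENGTH / 16) mod 16 blocks are finished by FINAL_BLOCKS.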
+;;; Flow: +;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks +;;; - Decrypt final blocks (1-15 blocks) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro AES_CBC_DEC 7 +%define %%CIPH_IN %1 ;; [in] pointer to input buffer +%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer +%define %%KEYS %3 ;; [in] pointer to expanded keys +%define %%IV %4 ;; [in] pointer to IV +%define %%LENGTH %5 ;; [in/out] GP register with length in bytes +%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value +%define %%TMP %7 ;; [clobbered] GP register + + cmp %%LENGTH, 0 + je %%cbc_dec_done + + vinserti64x2 zIV, zIV, [%%IV], 3 + + ;; preload keys + LOAD_KEYS %%KEYS, %%NROUNDS + +%%decrypt_16_parallel: + cmp %%LENGTH, 256 + jb %%final_blocks + + DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \ + zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \ + zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP + jmp %%decrypt_16_parallel + +%%final_blocks: + ;; get num final blocks + shr %%LENGTH, 4 + and %%LENGTH, 0xf + je %%cbc_dec_done + + cmp %%LENGTH, 8 + je %%final_num_blocks_is_8 + jl %%final_blocks_is_1_7 + + ; Final blocks 9-15 + cmp %%LENGTH, 12 + je %%final_num_blocks_is_12 + jl %%final_blocks_is_9_11 + + ; Final blocks 13-15 + cmp %%LENGTH, 15 + je %%final_num_blocks_is_15 + cmp %%LENGTH, 14 + je %%final_num_blocks_is_14 + cmp %%LENGTH, 13 + je %%final_num_blocks_is_13 + +%%final_blocks_is_9_11: + cmp %%LENGTH, 11 + je %%final_num_blocks_is_11 + cmp %%LENGTH, 10 + je %%final_num_blocks_is_10 + cmp %%LENGTH, 9 + je %%final_num_blocks_is_9 + +%%final_blocks_is_1_7: + cmp %%LENGTH, 4 + je %%final_num_blocks_is_4 + jl %%final_blocks_is_1_3 + + ; Final blocks 5-7 + cmp %%LENGTH, 7 + je %%final_num_blocks_is_7 + cmp %%LENGTH, 6 + je %%final_num_blocks_is_6 + cmp %%LENGTH, 5 + je %%final_num_blocks_is_5 + +%%final_blocks_is_1_3: + cmp %%LENGTH, 3 + je %%final_num_blocks_is_3 + cmp %%LENGTH, 2 + je %%final_num_blocks_is_2 + jmp %%final_num_blocks_is_1 + + +%%final_num_blocks_is_15: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_14: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_13: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_12: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_11: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_10: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_9: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_8: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + 
jmp %%cbc_dec_done + +%%final_num_blocks_is_7: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_6: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_5: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_4: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_3: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_2: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + jmp %%cbc_dec_done + +%%final_num_blocks_is_1: + FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \ + zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \ + %%TMP, %%NROUNDS + +%%cbc_dec_done: +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +%ifndef CBCS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +mk_global aes_cbc_dec_128_vaes_avx512,function,internal +aes_cbc_dec_128_vaes_avx512: + endbranch +%ifidn __OUTPUT_FORMAT__, win64 + mov num_bytes, [rsp + 8*5] +%endif + AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +mk_global aes_cbc_dec_192_vaes_avx512,function,internal +aes_cbc_dec_192_vaes_avx512: + endbranch +%ifidn __OUTPUT_FORMAT__, win64 + mov num_bytes, [rsp + 8*5] +%endif + AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +mk_global aes_cbc_dec_256_vaes_avx512,function,internal +aes_cbc_dec_256_vaes_avx512: + endbranch +%ifidn __OUTPUT_FORMAT__, win64 + mov num_bytes, [rsp + 8*5] +%endif + AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp + + ret + +%endif ;; CBCS + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. 
+%ifidn __OUTPUT_FORMAT__, win64 +global no_aes_cbc_dec_256_vaes_avx512 +no_aes_cbc_dec_256_vaes_avx512: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm new file mode 100644 index 000000000..a7fbf39b4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm @@ -0,0 +1,137 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 128 bit CBC AES encrypt +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_128_x4(void *in, +;; uint8_t *IV, +;; uint8_t *keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +%define KEY_ROUNDS 11 +%define XMM_USAGE (16) +%define UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) +%define EARLY_BLOCKS (2) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_ENC aesenc +%define AES_ENC_LAST aesenclast + +%include "cbc_common.asm" + + +mk_global aes_cbc_enc_128_x4, function +func(aes_cbc_enc_128_x4) + endbranch + FUNC_SAVE + + mov IDX, 0 + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm new file mode 100644 index 000000000..24ab33fe5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm @@ -0,0 +1,151 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 128 bit CBC AES encrypt +;; clobbers all registers except for ARG1 and rbp + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_256_x8(void *in, +;; uint8_t *IV, +;; uint8_t keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +;; clobbers all registers except for ARG1 and rbp + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +%define KEY_ROUNDS 11 +%define XMM_USAGE (16) +%DEFINE UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) +%define IV_CNT (1) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_ENC 2 + vaesenc %1, %1, %2 +%endm + +%macro AES_ENC_LAST 2 + vaesenclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + + +mk_global aes_cbc_enc_128_x8, function +func(aes_cbc_enc_128_x8) + endbranch + FUNC_SAVE + + mov IDX, 0 + + 
FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm new file mode 100644 index 000000000..b3d80e922 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm @@ -0,0 +1,149 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 192 bit CBC AES encrypt +;;; Updates In and Out pointers at end + +;include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_192_x4(void *in, +;; uint8_t *IV, +;; uint8_t keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +%define KEY_ROUNDS 13 +%define XMM_USAGE (16) +%DEFINE UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_ENC aesenc +%define AES_ENC_LAST aesenclast + +%include "cbc_common.asm" + + +mk_global aes_cbc_enc_192_x4, function +func(aes_cbc_enc_192_x4) + endbranch + FUNC_SAVE + + mov IDX, 0 + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm new file mode 100644 index 000000000..89d233819 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm @@ -0,0 +1,147 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 192 bit CBC AES encrypt +;; clobbers all registers except for ARG1 and rbp +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_192_x8(void *in, +;; uint8_t *IV, +;; uint8_t keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +;; clobbers all registers except for ARG1 and rbp + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + 
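On win64 only the first four arguments arrive in registers (rcx, rdx, r8, r9), so the prolog macro above fetches the length with mov LEN, arg(4): with stack_size = 10*16 + 1*8 = 168 and PS = 8, arg(4) expands to [rsp + 168 + 8 + 8*4] = [rsp + 208], which is the caller's fifth argument slot once the return address (8 bytes) and the 32-byte shadow space are skipped.

The aes_cbc_enc_192_x8 kernel that follows walks the standard CBC encryption chain. Unlike decryption, every output block feeds the next input block, so the work is inherently serial within one buffer; a minimal scalar reference in C, assuming a hypothetical single-block primitive aes192_encrypt_block() that is not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* hypothetical single-block AES-192 encrypt using the pre-expanded
     * encryption round keys (enc_keys produced by aes_cbc_precomp()) */
    void aes192_encrypt_block(const uint8_t *enc_keys, const uint8_t *in, uint8_t *out);

    /* Reference model: C[i] = E_K(P[i] XOR C[i-1]), with C[-1] = IV.
     * len_bytes is assumed to be a multiple of 16. */
    static void cbc_enc_192_ref(const uint8_t *in, const uint8_t *iv,
                                const uint8_t *enc_keys, uint8_t *out,
                                uint64_t len_bytes)
    {
            uint8_t chain[16], blk[16];
            uint64_t i, j;

            memcpy(chain, iv, 16);
            for (i = 0; i < len_bytes; i += 16) {
                    for (j = 0; j < 16; j++)
                            blk[j] = in[i + j] ^ chain[j];      /* XOR plaintext with previous ciphertext */
                    aes192_encrypt_block(enc_keys, blk, chain); /* ciphertext becomes the next chain value */
                    memcpy(out + i, chain, 16);
            }
    }

This per-block dependency is why the encrypt kernels, unlike the decrypt kernels, cannot batch independent blocks within a single buffer.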
+%define KEY_ROUNDS 13 +%define XMM_USAGE (16) +%DEFINE UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_ENC 2 + vaesenc %1, %1, %2 +%endm + +%macro AES_ENC_LAST 2 + vaesenclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + +mk_global aes_cbc_enc_192_x8, function +func(aes_cbc_enc_192_x8) + endbranch + FUNC_SAVE + + mov IDX, 0 + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm new file mode 100644 index 000000000..ab37668c7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm @@ -0,0 +1,141 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 256 bit CBC AES encrypt +;;; Updates In and Out pointers at end + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_256_x4(void *in, +;; uint8_t *IV, +;; uint8_t keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +%define KEY_ROUNDS 15 +%define XMM_USAGE (16) +%DEFINE UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) + +; instruction set specific operation definitions +%define MOVDQ movdqu +%define PXOR pxor +%define AES_ENC aesenc +%define AES_ENC_LAST aesenclast + +%include "cbc_common.asm" + + +mk_global aes_cbc_enc_256_x4, function +func(aes_cbc_enc_256_x4) + endbranch + FUNC_SAVE + + mov IDX, 0 + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm new file mode 100644 index 000000000..83e53ac11 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm @@ -0,0 +1,148 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; routine to do a 256 bit CBC AES encrypt +;; clobbers all registers except for ARG1 and rbp +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Updates In and Out pointers at end +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void aes_cbc_enc_256_x4(void *in, +;; uint8_t *IV, +;; uint8_t keys, +;; void *out, +;; uint64_t len_bytes); +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 +%define IN0 rdi +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%define KEYS0 rdx +%define OUT0 rcx +%define func(x) x: +%define FUNC_SAVE +%define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define IN0 rcx +%define IN rcx +%define IV rdx +%define KEYS0 r8 +%define OUT0 r9 +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%define PS 8 +%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 +%define arg(x) [rsp + stack_size + PS + PS*x] + +%define func(x) proc_frame x +%macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqa [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + end_prolog + mov LEN, arg(4) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + add rsp, stack_size +%endmacro +%endif + +%define KEY_ROUNDS 15 +%define XMM_USAGE (16) +%DEFINE UNROLLED_LOOPS (3) +%define PARALLEL_BLOCKS (UNROLLED_LOOPS) + +; instruction set specific operation definitions +%define MOVDQ vmovdqu +%macro PXOR 2 + vpxor %1, %1, %2 +%endm + +%macro AES_ENC 2 + vaesenc 
%1, %1, %2 +%endm + +%macro AES_ENC_LAST 2 + vaesenclast %1, %1, %2 +%endm + +%include "cbc_common.asm" + + +mk_global aes_cbc_enc_256_x8, function +func(aes_cbc_enc_256_x8) + endbranch + FUNC_SAVE + + mov IDX, 0 + + FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ + CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX + +main_loop: + CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN + jne main_loop + +done: + FUNC_RESTORE + ret + +endproc_frame diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm new file mode 100644 index 000000000..0cc09afe1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm @@ -0,0 +1,102 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +default rel +[bits 64] + +extern aes_cbc_dec_128_sse +extern aes_cbc_dec_128_avx +extern aes_cbc_dec_192_sse +extern aes_cbc_dec_192_avx +extern aes_cbc_dec_256_sse +extern aes_cbc_dec_256_avx + +extern aes_cbc_enc_128_x4 +extern aes_cbc_enc_128_x8 +extern aes_cbc_enc_192_x4 +extern aes_cbc_enc_192_x8 +extern aes_cbc_enc_256_x4 +extern aes_cbc_enc_256_x8 + +%if (AS_FEATURE_LEVEL) >= 10 +extern aes_cbc_dec_128_vaes_avx512 +extern aes_cbc_dec_192_vaes_avx512 +extern aes_cbc_dec_256_vaes_avx512 +%endif + +%include "multibinary.asm" + +;;;; +; instantiate aesni_cbc interfaces enc and dec +;;;; +mbin_interface aes_cbc_dec_128 +mbin_dispatch_init7 aes_cbc_dec_128, \ + aes_cbc_dec_128_sse, \ + aes_cbc_dec_128_sse, \ + aes_cbc_dec_128_avx, \ + aes_cbc_dec_128_avx, \ + aes_cbc_dec_128_avx, \ + aes_cbc_dec_128_vaes_avx512 + +mbin_interface aes_cbc_dec_192 +mbin_dispatch_init7 aes_cbc_dec_192, \ + aes_cbc_dec_192_sse, \ + aes_cbc_dec_192_sse, \ + aes_cbc_dec_192_avx, \ + aes_cbc_dec_192_avx, \ + aes_cbc_dec_192_avx, \ + aes_cbc_dec_192_vaes_avx512 + +mbin_interface aes_cbc_dec_256 +mbin_dispatch_init7 aes_cbc_dec_256, \ + aes_cbc_dec_256_sse, \ + aes_cbc_dec_256_sse, \ + aes_cbc_dec_256_avx, \ + aes_cbc_dec_256_avx, \ + aes_cbc_dec_256_avx, \ + aes_cbc_dec_256_vaes_avx512 + +mbin_interface aes_cbc_enc_128 +mbin_dispatch_init aes_cbc_enc_128, aes_cbc_enc_128_x4, aes_cbc_enc_128_x8, aes_cbc_enc_128_x8 +mbin_interface aes_cbc_enc_192 +mbin_dispatch_init aes_cbc_enc_192, aes_cbc_enc_192_x4, aes_cbc_enc_192_x8, aes_cbc_enc_192_x8 +mbin_interface aes_cbc_enc_256 +mbin_dispatch_init aes_cbc_enc_256, aes_cbc_enc_256_x4, aes_cbc_enc_256_x8, aes_cbc_enc_256_x8 + + + +;;; func core, ver, snum +slversion aes_cbc_enc_128, 00, 00, 0291 +slversion aes_cbc_dec_128, 00, 00, 0292 +slversion aes_cbc_enc_192, 00, 00, 0293 +slversion aes_cbc_dec_192, 00, 00, 0294 +slversion aes_cbc_enc_256, 00, 00, 0295 +slversion aes_cbc_dec_256, 00, 00, 0296 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c new file mode 100644 index 000000000..7ae5c9078 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c @@ -0,0 +1,339 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include +#include +#include "ossl_helper.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static unsigned char const ic[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f +}; + +static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext, + *ossl_cyphertext; +static uint8_t test_key[CBC_256_BITS]; + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + unsigned int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int aes_128_perf(uint8_t * key) +{ + int i, ret; + + /* Initialize our cipher context, which can use same input vectors */ + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + memcpy(iv, ic, CBC_IV_DATA_LEN); + + aes_cbc_precomp(key, 128, key_data); + aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, + plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_128_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + 
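+        /* each timed section above runs TEST_LOOPS iterations over the same TEST_LEN
+         * buffer, so perf_print() reports throughput over TEST_LEN * i bytes */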
printf("\n"); + return 0; +} + +int aes_192_perf(uint8_t * key) +{ + int i, ret; + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + memcpy(iv, ic, CBC_IV_DATA_LEN); + aes_cbc_precomp(key, 192, key_data); + aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, + cyphertext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_192_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + printf("\n"); + return 0; +} + +int aes_256_perf(uint8_t * key) +{ + int i, ret; + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + aes_cbc_precomp(key, 256, key_data); + memcpy(iv, ic, CBC_IV_DATA_LEN); + aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, + cyphertext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_256_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_256_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf 
start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + printf("\n"); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + srand(TEST_SEED); + + plaintext = malloc(TEST_LEN); + cbc_plaintext = malloc(TEST_LEN); + cyphertext = malloc(TEST_LEN); + ossl_plaintext = malloc(TEST_LEN); + ossl_cyphertext = malloc(TEST_LEN); + if (NULL == plaintext || NULL == cyphertext || NULL == cbc_plaintext + || NULL == ossl_plaintext || NULL == ossl_cyphertext) { + printf("malloc of testsize:0x%x failed\n", TEST_LEN); + return 1; + } + + mk_rand_data(plaintext, TEST_LEN); + mk_rand_data(test_key, sizeof(test_key)); + printf("AES CBC ISA-L vs OpenSSL performance:\n"); + OK += aes_128_perf(test_key); + OK += aes_192_perf(test_key); + OK += aes_256_perf(test_key); + + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c new file mode 100644 index 000000000..8e8f41792 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c @@ -0,0 +1,56 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include + +int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk) +{ + if (CBC_128_BITS == key_size) { + aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else if (CBC_192_BITS == key_size) { + aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else if (CBC_256_BITS == key_size) { + aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else { + //Invalid key length + return 1; + } + return 0; +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver aes_cbc_precomp_slver_00000297; +struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h new file mode 100644 index 000000000..7bebcaed4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h @@ -0,0 +1,466 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef AES_CBC_STD_VECTORS_H_ +#define AES_CBC_STD_VECTORS_H_ +#include + + +// struct to hold pointers to the cbc data vectors +struct cbc_vector { + uint8_t* K; // AES Key + cbc_key_size K_LEN; // length of key in bits + uint8_t* IV; // initial value used by GCM + uint64_t P_LEN; // length of our plaintext + uint8_t* P; // Plain text + //outputs of encryption + uint8_t* EXP_C; // same length as P + // used in vector checks, not populated in std vector array + uint8_t *C; + struct cbc_key_data *KEYS; +}; + + +/////////////////////////////////////////// +// Test vectors from: +// Intel IPSec library 1..3 +// +/////////////////////////////////////////// +static unsigned char K1[] = { + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c +}; +static unsigned char IV1[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; +static unsigned char P1[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a, + 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C1[] = { + 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d, + 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2, + 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16, + 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7 +}; + +static unsigned char K2[] = { + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4 +}; +static unsigned char IV2[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; +static unsigned char P2[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a, + 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C2[] = { + 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6, + 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d, + 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61, + 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b +}; + +static unsigned char K3[] = { + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7 +}; +static unsigned char IV3[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; +static unsigned char P3[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a, + 0xae, 0x2d, 0x8a, 0x57, 0x1e, 
0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C3[] = { + 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8, + 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa, + 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2, + 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03, + 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea +}; + +/////////////////////////////////////////// +// Test vectors from: +// 'https://tools.ietf.org/html/rfc3602#section-3.2' +// The AES-CBC Cipher Algorithm and Its Use with IPsec +// +/////////////////////////////////////////// +/* +Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key +Key : 0x06a9214036b8a15b512e03d534120006 +IV : 0x3dafba429d9eb430b422da802c9fac41 +Plaintext : "Single block msg" +Ciphertext: 0xe353779c1079aeb82708942dbe77181a + * + */ +static unsigned char K4[] = { + 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 +}; +static unsigned char IV4[] = { + 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 +}; +static unsigned char P4[] = { + "Single block msg" +}; +static unsigned char C4[] = { + 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a +}; + +/* +Case #2: Encrypting 32 bytes (2 blocks) using AES-CBC with 128-bit key +Key : 0xc286696d887c9aa0611bbb3e2025a45a +IV : 0x562e17996d093d28ddb3ba695a2e6f58 +Plaintext : 0x000102030405060708090a0b0c0d0e0f + 101112131415161718191a1b1c1d1e1f +Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a + 7586602d253cfff91b8266bea6d61ab1 +*/ +static unsigned char K5[] = { + 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a +}; +static unsigned char IV5[] = { + 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 +}; +static unsigned char P5[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, + 0x1c, 0x1d, 0x1e, 0x1f +}; +static unsigned char C5[] = { + 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, + 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe, + 0xa6, 0xd6, 0x1a, 0xb1 +}; + +/* +Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key +Key : 0x6c3ea0477630ce21a2ce334aa746c2cd +IV : 0xc782dc4c098c66cbd9cd27d825682c81 +Plaintext : "This is a 48-byte message (exactly 3 AES blocks)" +Ciphertext: 0xd0a02b3836451753d493665d33f0e886 + 2dea54cdb293abc7506939276772f8d5 + 021c19216bad525c8579695d83ba2684 + + */ +static unsigned char K6[] = { + 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd +}; +static unsigned char IV6[] = { + 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81 +}; +static unsigned char P6[] = { + "This is a 48-byte message (exactly 3 AES blocks)" +}; +static unsigned char C6[] = { + 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86, + 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 
0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5, + 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84 +}; + +/* +Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key +Key : 0x56e47a38c5598974bc46903dba290349 +IV : 0x8ce82eefbea0da3c44699ed7db51b7d9 +Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf + b0b1b2b3b4b5b6b7b8b9babbbcbdbebf + c0c1c2c3c4c5c6c7c8c9cacbcccdcecf + d0d1d2d3d4d5d6d7d8d9dadbdcdddedf +Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa + 0f3af07a9a31a9c684db207eb0ef8e4e + 35907aa632c3ffdf868bb7b29d3d46ad + 83ce9f9a102ee99d49a53e87f4c3da55 + */ +static unsigned char K7[] = { + 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49 +}; +static unsigned char IV7[] = { + 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9 +}; +static unsigned char P7[] = { + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf +}; +static unsigned char C7[] = { + 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa, + 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e, + 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad, + 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55 +}; + +/* +Case #5: Sample transport-mode ESP packet (ping 192.168.123.100) +Key: 90d382b4 10eeba7a d938c46c ec1a82bf +SPI: 4321 +Source address: 192.168.123.3 +Destination address: 192.168.123.100 +Sequence number: 1 +IV: e96e8c08 ab465763 fd098d45 dd3ff893 + +Original packet: +IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64 +Data (64 bytes): +08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 + +Augment data with: +Padding: 01020304 05060708 090a0b0c 0d0e +Pad length: 0e +Next header: 01 (ICMP) + +Pre-encryption Data with padding, pad length and next header (80 bytes): +08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 +01020304 05060708 090a0b0c 0d0e0e01 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64 +SPI/Seq #: 00004321 00000001 +IV: e96e8c08 ab465763 fd098d45 dd3ff893 +Encrypted Data (80 bytes): +f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856 +a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a +a269add0 47ad2d59 13ac19b7 cfbad4a6 +*/ +static unsigned char K8[] = { + 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf +}; +static unsigned char IV8[] = { + 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93 +}; +static unsigned char P8[] = { + 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 
0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01 +}; +static unsigned char C8[] = { + 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49, + 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56, + 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52, + 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a, + 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6 +}; + +/* +Case #6: Sample transport-mode ESP packet + (ping -p 77 -s 20 192.168.123.100) +Key: 90d382b4 10eeba7a d938c46c ec1a82bf +SPI: 4321 +Source address: 192.168.123.3 +Destination address: 192.168.123.100 +Sequence number: 8 +IV: 69d08df7 d203329d b093fc49 24e5bd80 + +Original packet: +IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64 +Data (28 bytes): +0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 + +Augment data with: +Padding: 0102 +Pad length: 02 +Next header: 01 (ICMP) + +Pre-encryption Data with padding, pad length and next header (32 bytes): +0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64 +SPI/Seq #: 00004321 00000008 +IV: 69d08df7 d203329d b093fc49 24e5bd80 +Encrypted Data (32 bytes): +f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75 + */ +static unsigned char K9[] = { + 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf +}; +static unsigned char IV9[] = { + 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80 +}; +static unsigned char P9[] = { + 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00, + 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01 +}; +static unsigned char C9[] = { + 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09, + 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75 +}; + +/* +Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200) +Key: 01234567 89abcdef 01234567 89abcdef +SPI: 8765 +Source address: 192.168.123.3 +Destination address: 192.168.123.200 +Sequence number: 2 +IV: f4e76524 4f6407ad f13dc138 0f673f37 + +Original packet: +IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8 +Data (64 bytes): +08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 + +Augment data with: +Padding: 01020304 05060708 090a +Pad length: 0a +Next header: 04 (IP-in-IP) + +Pre-encryption Data with original IP header, padding, pad length and + next header (96 bytes): +45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d +02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223 +24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04 + + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8 +SPI/Seq #: 00008765 00000002 +IV: f4e76524 4f6407ad 
f13dc138 0f673f37 +Encrypted Data (96 bytes): +773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8 +e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5 +c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d + */ +static unsigned char K10[] = { + 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef +}; +static unsigned char IV10[] = { + 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37 +}; +static unsigned char P10[] = { + 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03, + 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d, + 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, + 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04 + +}; +static unsigned char C10[] = { + 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c, + 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8, + 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6, + 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5, + 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a, + 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d +}; + +/* +Case #8: Sample tunnel-mode ESP packet + (ping -p ff -s 40 192.168.123.200) +Key: 01234567 89abcdef 01234567 89abcdef +SPI: 8765 +Source address: 192.168.123.3 +Destination address: 192.168.123.200 +Sequence number: 5 +IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22 + +Original packet: +IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8 +Data (48 bytes): +0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff +ffffffff ffffffff ffffffff ffffffff + +Augment data with: +Padding: 01020304 05060708 090a +Pad length: 0a +Next header: 04 (IP-in-IP) + +Pre-encryption Data with original IP header, padding, pad length and + next header (80 bytes): +45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d +a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff +ffffffff 01020304 05060708 090a0a04 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8 +SPI/Seq #: 00008765 00000005 +IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22 +Encrypted Data (80 bytes): +15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5 +0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 3512e5de 435e7207 ed971ef3 +d2726d9b 5ef6affc 6d17a0de cbb13892 + */ +static unsigned char K11[] = { + 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef +}; +static unsigned char IV11[] = { + 0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22 +}; +static unsigned char P11[] = { + 0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03, + 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d, + 0xa3, 
0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04 +}; +static unsigned char C11[] = { + 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f, + 0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5, + 0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39, + 0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3, + 0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92 +}; + + +#define min_size(a, b) (((a)<(b))?(a):(b)) +// Plain and cypher text will be the same size +// Those vectors using strings for plain text have an extra null terminator that needs +// to be ignored +#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C)))) +#define CBC_KEY_LEN(kdata) (sizeof(kdata)) + +//field order {K, Klen, IV, Plen, P, C}; +#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/} +struct cbc_vector const cbc_vectors[] = { + vector(1), + vector(2), + vector(3), + vector(4), + vector(5), + vector(6), + vector(7), + vector(8), + vector(9), + vector(10), + vector(11), +}; + +#endif /* AES_CBC_STD_VECTORS_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c new file mode 100644 index 000000000..aa9412c35 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c @@ -0,0 +1,443 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include +#include +#include "types.h" +#include "ossl_helper.h" +#include "cbc_std_vectors.h" + +//define CBC_VECTORS_VERBOSE +//define CBC_VECTORS_EXTRA_VERBOSE + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 100 +#endif +#ifndef TEST_LEN +# define TEST_LEN (8*1024*1024) +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif +#ifndef MAX_UNALINED +# define MAX_UNALINED (16) +#endif + +static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS }; + +typedef void (*aes_cbc_generic)(uint8_t * in, + uint8_t * IV, + uint8_t * keys, uint8_t * out, uint64_t len_bytes); + +int OpenSslEnc(uint8_t k_len, + uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes) +{ + if (CBC_128_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL128 "); +#endif + openssl_aes_128_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_192_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL192 "); +#endif + openssl_aes_192_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_256_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL256 "); + fflush(0); +#endif + openssl_aes_256_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else { + fprintf(stderr, "Invalid key length: %d\n", k_len); + return 1; + } + return 0; +} + +int OpenSslDec(uint8_t k_len, + uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes) +{ + if (CBC_128_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL128 "); +#endif + openssl_aes_128_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_192_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL192 "); +#endif + openssl_aes_192_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_256_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL256 "); +#endif + openssl_aes_256_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); + } else { + fprintf(stderr, "Invalid key length: %d\n", k_len); + return 1; + } + return 0; +} + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + uint64_t a; + + mismatch = memcmp(test, expected, len); + if (!mismatch) { + return OK; + + } else { + OK = 1; + printf(" failed %s \t\t", data_name); + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + return OK; +} + +int check_vector(struct cbc_vector *vector) +{ + uint8_t *pt_test = NULL; + uint8_t *o_ct_test = NULL; + int OK = 0; + aes_cbc_generic enc; + aes_cbc_generic dec; + +#ifdef CBC_VECTORS_VERBOSE + printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN); +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C, + vector->IV, vector->EXP_C, vector->KEYS); +#endif + fflush(0); +#else + printf("."); +#endif + + if (CBC_128_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_128; + dec = (aes_cbc_generic) & aes_cbc_dec_128; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC128 "); +#endif + } else if (CBC_192_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_192; + dec 
= (aes_cbc_generic) & aes_cbc_dec_192; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC192 "); +#endif + } else if (CBC_256_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_256; + dec = (aes_cbc_generic) & aes_cbc_dec_256; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC256 "); +#endif + } else { + printf("Invalid key length: %d\n", vector->K_LEN); + return 1; + } + + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->P_LEN); + o_ct_test = malloc(vector->P_LEN); + if ((pt_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + + aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS); + +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + //// + // ISA-l Encrypt + //// + enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN); + if (NULL != vector->EXP_C) { //when the encrypted text is know verify correct + OK |= + check_data(vector->EXP_C, vector->C, vector->P_LEN, + "ISA-L expected cypher text (C)"); + } + OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN); + OK |= + check_data(vector->C, o_ct_test, vector->P_LEN, + "OpenSSL vs ISA-L cypher text (C)"); + + memcpy(pt_test, vector->P, vector->P_LEN); + memset(vector->P, 0, vector->P_LEN); +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + + //// + // ISA-l Decrypt + //// + dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->P_LEN); + dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)"); + memset(vector->P, 0, vector->P_LEN); + OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)"); +#ifdef CBC_VECTORS_VERBOSE + if (OK) + printf("Failed"); + else + printf("Passed"); + + printf("\n"); +#endif + + return OK; +} + +int test_std_combinations(void) +{ + int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]); + int i, ret; + uint8_t *iv = NULL; + + printf("AES CBC standard test vectors:"); +#ifdef CBC_VECTORS_VERBOSE + printf("\n"); +#endif + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == iv)) + return 1; + + for (i = 0; (i < vectors_cnt); i++) { + struct cbc_vector vect = cbc_vectors[i]; + + ret = posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS))); + if ((0 != ret) || (NULL == vect.KEYS)) + return 1; + // IV data must be aligned to 16 byte boundary so move data in aligned buffer and change out the pointer + memcpy(iv, vect.IV, CBC_IV_DATA_LEN); + vect.IV = iv; + vect.C = NULL; + vect.C = malloc(vect.P_LEN); + if ((NULL == vect.C)) + return 1; +#ifdef CBC_VECTORS_VERBOSE + printf("vector[%d of %d] ", i, vectors_cnt); +#endif + if (0 == (i % 25)) + printf("\n"); + if (0 == (i % 10)) + fflush(0); + + if (0 != check_vector(&vect)) + return 1; + + aligned_free(vect.KEYS); + free(vect.C); + } + + aligned_free(iv); + printf("\n"); + return 0; +} + +int test_random_combinations(void) +{ + struct cbc_vector test; + int t, ret; + + printf("AES CBC random test vectors:"); +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + test.IV = NULL; + ret = posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == test.IV)) + return 1; + test.KEYS = NULL; + 
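+	/* Allocate the key schedule 16-byte aligned, like the IV buffer above;
+	 * these CBC tests keep the IV and expanded-key buffers aligned before
+	 * handing them to the ISA-L routines. */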
ret = posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS))); + if ((0 != ret) || (NULL == test.KEYS)) + return 1; + + for (t = 0; RANDOMS > t; t++) { + int Plen = 16 + ((rand() % TEST_LEN) & ~0xf); //must be a 16byte multiple + int offset = (rand() % MAX_UNALINED); + int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0]))); // select one of the valid key sizes + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + + test.C = NULL; + test.P = NULL; + test.K = NULL; + test.EXP_C = NULL; + test.P_LEN = Plen; + test.K_LEN = Ksize[Kindex]; + + test.P = malloc(test.P_LEN + offset); + test.C = malloc(test.P_LEN + offset); + test.K = malloc(test.K_LEN + offset); + if ((NULL == test.P) || (NULL == test.C) || (NULL == test.K)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return -1; + } + test.P += offset; + test.C += offset; + test.K += offset; + + mk_rand_data(test.P, test.P_LEN); + mk_rand_data(test.K, test.K_LEN); + mk_rand_data(test.IV, CBC_IV_DATA_LEN); + +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" Offset:0x%x ", offset); +#endif + if (0 != check_vector(&test)) + return 1; + + test.C -= offset; + free(test.C); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + } + + aligned_free(test.IV); + aligned_free(test.KEYS); + printf("\n"); + return 0; +} + +int test_efence_combinations(void) +{ + struct cbc_vector test; + int offset = 0; + int key_idx; + uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL; + uint8_t *key_data = NULL; + + P = malloc(PAGE_LEN); + C = malloc(PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + key_data = malloc(PAGE_LEN); + + if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) + || (NULL == key_data) + ) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + // place buffers to end at page boundary + test.P_LEN = PAGE_LEN / 2; + test.EXP_C = NULL; + + printf("AES CBC efence test vectors:"); + for (key_idx = 0; key_idx < (sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) { + test.K_LEN = Ksize[key_idx]; + + for (offset = 0; MAX_UNALINED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff); // must be a multiple of 16 + if (16 > test.P_LEN) + test.P_LEN = 16; + //Place data at end of page + test.P = P + PAGE_LEN - test.P_LEN - offset; + test.C = C + PAGE_LEN - test.P_LEN - offset; + test.K = K + PAGE_LEN - test.K_LEN - offset; + test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset; + test.IV = test.IV - ((uint64_t) test.IV & 0xff); // align to 16 byte boundary + test.KEYS = (struct cbc_key_data *) + (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset); + test.KEYS = (struct cbc_key_data *) + ((uint8_t *) test.KEYS - ((uint64_t) test.KEYS & 0xff)); // align to 16 byte boundary + + mk_rand_data(test.P, test.P_LEN); + mk_rand_data(test.K, test.K_LEN); + mk_rand_data(test.IV, CBC_IV_DATA_LEN); +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" Offset:0x%x ", offset); +#endif + if (0 != check_vector(&test)) + return 1; + } + + } + + free(P); + free(C); + free(K); + free(IV); + free(key_data); + printf("\n"); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + srand(TEST_SEED); + OK |= test_std_combinations(); + OK |= test_random_combinations(); + OK |= test_efence_combinations(); + if (0 == OK) { + printf("...Pass\n"); + } else { + printf("...Fail\n"); + } + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c 
b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c new file mode 100644 index 000000000..0558b4254 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c @@ -0,0 +1,183 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * Run list of standard CBC test vectors through encode and decode checks. 
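+ * Each vector is expanded with aes_cbc_precomp(), encrypted with the
+ * key-size specific aes_cbc_enc_128/192/256() routine and compared against
+ * the expected ciphertext, then decrypted with the matching aes_cbc_dec_*()
+ * routine and compared against the original plaintext.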
+ */ + +#include +#include +#include +#include +#include +#include "types.h" +#include "cbc_std_vectors.h" + +typedef void (*aes_cbc_generic)(uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out, + uint64_t len_bytes); + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + uint64_t a; + + mismatch = memcmp(test, expected, len); + if (!mismatch) { + return OK; + + } else { + OK = 1; + printf(" failed %s \t\t", data_name); + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + return OK; +} + +int check_vector(struct cbc_vector *vector) +{ + uint8_t *pt_test = NULL; + int OK = 0; + aes_cbc_generic enc; + aes_cbc_generic dec; + + DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN)); + DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C, + vector->IV, vector->EXP_C, vector->KEYS)); + printf("."); + + switch (vector->K_LEN) { + case CBC_128_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_128; + dec = (aes_cbc_generic) & aes_cbc_dec_128; + DEBUG_PRINT((" CBC128 ")); + break; + case CBC_192_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_192; + dec = (aes_cbc_generic) & aes_cbc_dec_192; + DEBUG_PRINT((" CBC192 ")); + break; + case CBC_256_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_256; + dec = (aes_cbc_generic) & aes_cbc_dec_256; + DEBUG_PRINT((" CBC256 ")); + break; + default: + printf("Invalid key length: %d\n", vector->K_LEN); + return 1; + } + + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->P_LEN); + + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + + aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS); + + //// + // ISA-l Encrypt + //// + enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN); + + if (NULL != vector->EXP_C) { //when the encrypted text is known verify correct + OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN, + "ISA-L expected cypher text (C)"); + } + memcpy(pt_test, vector->P, vector->P_LEN); + memset(vector->P, 0, vector->P_LEN); + + //// + // ISA-l Decrypt + //// + dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)"); + DEBUG_PRINT((OK ? "Failed\n" : "Passed\n")); + + free(pt_test); + return OK; +} + +int test_std_combinations(void) +{ + int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]); + int i, ret; + uint8_t *iv = NULL; + + printf("AES CBC standard test vectors: "); + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == iv)) + return 1; + + for (i = 0; (i < vectors_cnt); i++) { + struct cbc_vector vect = cbc_vectors[i]; + + ret = posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS)); + if ((0 != ret) || (NULL == vect.KEYS)) + return 1; + + // IV data must be aligned to 16 byte boundary so move data in + // aligned buffer and change out the pointer + memcpy(iv, vect.IV, CBC_IV_DATA_LEN); + vect.IV = iv; + vect.C = malloc(vect.P_LEN); + if (NULL == vect.C) + return 1; + + DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt)); + + if (0 != check_vector(&vect)) + return 1; + + aligned_free(vect.KEYS); + free(vect.C); + } + + aligned_free(iv); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + OK = test_std_combinations(); + + printf(0 == OK ? 
"Pass\n" : "Fail\n"); + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm new file mode 100644 index 000000000..2c80401e9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm @@ -0,0 +1,202 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _CLEAR_REGS_ASM_ +%define _CLEAR_REGS_ASM_ + +%ifndef LINUX +%ifidn __OUTPUT_FORMAT__, elf64 +%define LINUX +%endif +%endif + +; +; This macro clears any GP registers passed +; +%macro clear_gps 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + xor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on SSE +; +%macro clear_xmms_sse 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + pxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on AVX +; +%macro clear_xmms_avx 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any YMM registers passed +; +%macro clear_ymms 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any ZMM registers passed +; +%macro clear_zmms 1-32 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxorq %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears all scratch GP registers +; for Windows or Linux +; +%macro clear_scratch_gps_asm 0 + clear_gps rax, rcx, rdx, r8, r9, r10, r11 +%ifdef LINUX + clear_gps rdi, rsi +%endif +%endmacro + +; +; This macro clears all scratch XMM registers on SSE +; +%macro clear_scratch_xmms_sse_asm 0 +%ifdef LINUX +%assign i 0 +%rep 16 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch XMM registers on AVX +; +%macro clear_scratch_xmms_avx_asm 0 +%ifdef LINUX + vzeroall +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + vpxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch YMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +%macro clear_scratch_ymms_asm 0 +; On Linux, all YMM registers are scratch registers +%ifdef LINUX + vzeroall +; On Windows, YMM0-YMM5 registers are scratch registers. +; YMM6-YMM15 upper 128 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. +%else +%assign i 0 +%rep 6 + vpxor ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch ZMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15). YMM registers are used +; on purpose, since XOR'ing YMM registers is faster +; than XOR'ing ZMM registers, and the operation clears +; also the upper 256 bits +; +%macro clear_scratch_zmms_asm 0 +; On Linux, all ZMM registers are scratch registers +%ifdef LINUX + vzeroall + ;; vzeroall only clears the first 16 ZMM registers +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers. +; ZMM6-ZMM15 upper 384 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. 
+%else +%assign i 0 +%rep 6 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep + +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +%endif ;; _CLEAR_REGS_ASM diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm new file mode 100644 index 000000000..98304c552 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm new file mode 100644 index 000000000..5ee5e7b48 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm new file mode 100644 index 000000000..902c17237 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm new file mode 100644 index 000000000..1e55d24cf --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm new file mode 100644 index 000000000..1717a8662 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm new file mode 100644 index 000000000..d17402bea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm new file mode 100644 index 000000000..71f284789 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm @@ -0,0 +1,32 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +;; single buffer implementation +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm new file mode 100644 index 000000000..c0c587133 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm new file mode 100644 index 000000000..4b159cefb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm new file mode 100644 index 000000000..822ef07cc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm new file mode 100644 index 000000000..f6050a8ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm new file mode 100644 index 000000000..5959d698f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm new file mode 100644 index 000000000..c583d02b8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm new file mode 100644 index 000000000..5952a6005 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm new file mode 100644 index 000000000..bd318fcd1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm @@ -0,0 +1,32 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +;; single buffer implementation +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm new file mode 100644 index 000000000..da2f611b4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm new file mode 100644 index 000000000..90db18910 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm @@ -0,0 +1,2130 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. 
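+;
+; Build-time parameterization (illustrative note): this file is the common body
+; behind the thin gcm128_* / gcm256_* wrapper files added above.  The wrappers
+; select the key size via GCM128_MODE / GCM256_MODE, and the *_nt.asm variants
+; additionally define NT_LDST and FUNCT_EXTENSION=_nt (NT_LDST is presumably
+; what routes the VXLDR/VXSTR data accesses to their non-temporal forms in
+; gcm_defines.asm).  Assuming an entry point is declared through the FN_NAME
+; macro defined below, the resulting symbol names look like:
+;
+;     FN_NAME(enc,_), GCM128_MODE, no extension   ->  aes_gcm_enc_128_avx_gen2
+;     FN_NAME(enc,_), GCM256_MODE, _nt extension  ->  aes_gcm_enc_256_avx_gen2_nt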
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen2.asm!" 
+%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 13 +%endif + +default rel +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba + vpshufd %%T2, %%GH, 01001110b + vpshufd %%T3, %%HK, 01001110b + vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0) + vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0) + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + vpxor %%T2, %%T2, %%GH + vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs + vpxor %%GH, %%GH, %%T3 + vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK + + ;first phase of the reduction + vpslld %%T2, %%GH, 31 ; packed right shifting << 31 + vpslld %%T3, %%GH, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%GH, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%GH,1 ; packed left shifting >> 1 + vpsrld %%T3,%%GH,2 ; packed left shifting >> 2 + vpsrld %%T4,%%GH,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpxor %%T2, %%T2, %%T5 + vpxor %%GH, %%GH, %%T2 + vpxor %%GH, %%GH, %%T1 ; the result is in %%GH + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define 
%%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa %%T5, %%HK + + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly + vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly + vmovdqu [%%GDATA + HashKey_3], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_3_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly + vmovdqu [%%GDATA + HashKey_4], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly + vmovdqu [%%GDATA + HashKey_5], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_5_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly + vmovdqu [%%GDATA + HashKey_6], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly + vmovdqu [%%GDATA + HashKey_7], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly + vmovdqu [%%GDATA + HashKey_8], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_8_k], %%T1 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). 
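+; Example: for LENGTH = 13 the code below fills the low quadword of OUTPUT with
+; a single vpinsrq from [INPUT], then walks the remaining 5 bytes backwards from
+; INPUT+12, shifting each into a temporary GPR before inserting it as the high
+; quadword, so the read never goes past INPUT+LENGTH-1.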
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + vpxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + vpinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + vpinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + vmovdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. 
+; Input: +; GDATA_KEY - struct gcm_key_data * +; GDATA_CTX - struct gcm_context_data * +; PLAIN_CYPH_IN - input text +; PLAIN_CYPH_LEN - input text length +; DATA_OFFSET - the current data offset +; ENC_DEC - whether encoding or decoding +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + +%%_data_read: ;Finished reading in data + + + vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + cmp r13, rax + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + vmovdqa xmm3, xmm1 + vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpand xmm3, xmm1 + vpshufb xmm3, [SHUF_MASK] + vpshufb xmm3, xmm2 + vpxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpshufb xmm9, [SHUF_MASK] + vpshufb xmm9, xmm2 + vpxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX+PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + vmovdqu [%%GDATA_CTX + 
AadHash], %%AAD_HASH + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + vpshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + ; start AES for %%num_initial_blocks blocks + vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep ; NROUNDS + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + vpxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + vmovdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + 
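+;; Note: the remainder of this macro pre-computes the next 8 counter blocks in
+;; %%XMM1..%%XMM8, encrypts them, and XORs them with the next 128 bytes of
+;; input, so the first iteration of GHASH_8_ENCRYPT_8_PARALLEL already has 8
+;; ciphertext blocks to hash.  The "Haskey_i_k" banner below appears to be
+;; carried over from the PRECOMPUTE macro and does not describe this code.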
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM1, %%CTR + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM2, %%CTR + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM3, %%CTR + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM4, %%CTR + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM5, %%CTR + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM6, %%CTR + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM7, %%CTR + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM8, %%CTR + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%T_key + vpxor %%XMM2, %%T_key + vpxor %%XMM3, %%T_key + vpxor %%XMM4, %%T_key + vpxor %%XMM5, %%T_key + vpxor %%XMM6, %%T_key + vpxor %%XMM7, %%T_key + vpxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + vpxor %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpxor %%XMM1, 
%%T3 ; combine GHASHed value with the corresponding ciphertext + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; r11 is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], %%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONE] + vpaddd %%XMM3, %%XMM2, [ONE] + vpaddd %%XMM4, %%XMM3, [ONE] + vpaddd %%XMM5, %%XMM4, [ONE] + vpaddd %%XMM6, %%XMM5, [ONE] + vpaddd %%XMM7, %%XMM6, [ONE] + vpaddd %%XMM8, %%XMM7, [ONE] + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONEf] + vpaddd %%XMM3, %%XMM2, [ONEf] + vpaddd %%XMM4, %%XMM3, [ONEf] + vpaddd %%XMM5, %%XMM4, [ONEf] + vpaddd %%XMM6, %%XMM5, [ONEf] + vpaddd %%XMM7, %%XMM6, [ONEf] + vpaddd %%XMM8, %%XMM7, [ONEf] + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%T1 + vpxor %%XMM2, %%T1 + vpxor %%XMM3, %%T1 + vpxor %%XMM4, %%T1 + vpxor %%XMM5, %%T1 + vpxor %%XMM6, %%T1 + vpxor %%XMM7, %%T1 + vpxor %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + 
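+        ;; From this point the GHASH of the previous 8 ciphertext blocks (saved
+        ;; in %%T2 and [rsp + TMP2..TMP8] above) is interleaved with the
+        ;; remaining AES rounds: each round-key step is paired with one
+        ;; Karatsuba-style multiply against HashKey_8 .. HashKey, presumably so
+        ;; the vaesenc and vpclmulqdq latencies overlap.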
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + + vpshufd %%T6, %%T2, 01001110b + vpxor %%T6, %%T2 + + vmovdqu %%T5, [%%GDATA + HashKey_8_k] + vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; + + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + + vmovdqu %%T1, [%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + 
HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpxor %%T6, %%T4 + vpxor %%T6, %%T7 + +%ifdef GCM128_MODE + vmovdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif + +%assign i 0 +%assign j 1 +%rep 8 + +%ifidn %%ENC_DEC, ENC +%ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 +%else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] +%endif ; NT_LD + vaesenclast reg(j), reg(j), %%T2 +%else + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3 +%endif ; %%ENC_DEC + +%assign i (i+1) +%assign j (j+1) +%endrep + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T3 + vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7 + + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 
12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + %ifidn %%ENC_DEC, ENC + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer + %endif + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] + vpshufb %%XMM3, [SHUF_MASK] + vpshufb %%XMM4, [SHUF_MASK] + vpshufb %%XMM5, [SHUF_MASK] + vpshufb %%XMM6, [SHUF_MASK] + vpshufb %%XMM7, [SHUF_MASK] + vpshufb %%XMM8, [SHUF_MASK] + + + vpxor %%XMM1, %%T6 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. +; %%GDATA is GCM key data +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + ;; Karatsuba Method + + + vpshufd %%T2, %%XMM1, 01001110b + vpxor %%T2, %%XMM1 + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vmovdqu %%T3, [%%GDATA + HashKey_8_k] + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM2, 01001110b + vpxor %%T2, %%XMM2 + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM3, 01001110b + vpxor %%T2, %%XMM3 + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM4, 01001110b + vpxor %%T2, %%XMM4 + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM5, 01001110b + vpxor %%T2, %%XMM5 + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + 
vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM6, 01001110b + vpxor %%T2, %%XMM6 + vmovdqu %%T5, [%%GDATA + HashKey_3] + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM7, 01001110b + vpxor %%T2, %%XMM7 + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM8, 01001110b + vpxor %%T2, %%XMM8 + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T4 + vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + +%endmacro + + +; Encryption of a single block +; %%GDATA is GCM key data +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxor %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep ; NROUNDS + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + push rsi + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 + + mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5] +%endif +%endmacro + + +%macro 
FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX), +; IV, Additional Authentication data (A_IN), Additional +; Data length (A_LEN) +; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13, and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 +%define %%SUBHASH xmm1 + + + vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + vpxor xmm2, xmm3 + mov r10, %%A_LEN + + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + vpinsrq xmm2, [r10], 0 + vpinsrd xmm2, [r10+8], 2 + vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + vpshufb xmm2, [SHUF_MASK] + + vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX), +; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), +; and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET + add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN ; Update length of data processed + vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + vmovdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + + mov r13, %%PLAIN_CYPH_LEN + sub r13, %%DATA_OFFSET + mov r10, r13 ; save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + vmovd r15d, xmm9 + and r15d, 255 + vpshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + add r15b, 8 + 
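+        ;; r15 holds only the low byte of the counter; the check against 255-8
+        ;; above means the next 8 increments cannot carry out of that byte, so
+        ;; the counter block can stay in byte-reflected form and the out_order
+        ;; variant is used here.  The %%_encrypt_by_8 path below shuffles the
+        ;; counter back to in_order first so that a carry into the higher
+        ;; counter bytes is handled correctly.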
GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + vpshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + vpshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14 + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_blck_length = r13 + ; handle the last <16 Byte block seperately + + vpaddd xmm9, [ONE] ; INCR CNT to get Yn + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 + vpshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn) + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: +%ifidn %%ENC_DEC, DEC + vmovdqa xmm2, xmm1 + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpand xmm2, xmm1 + vpshufb xmm2, [SHUF_MASK] + vpxor xmm14, xmm2 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpshufb xmm9, [SHUF_MASK] + vpxor xmm14, xmm9 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + + + +%endmacro + + 
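+;; ---------------------------------------------------------------------------
+;; Illustrative usage sketch: the macros above are exposed through the
+;; init/update/finalize entry points defined further below (shown here for the
+;; 128-bit variant).  Assuming key_data has already been populated by AES key
+;; expansion plus aes_gcm_precomp_128_avx_gen2(), a C caller looks roughly
+;; like:
+;;
+;;     struct gcm_key_data key;       /* expanded round keys + HashKey tables */
+;;     struct gcm_context_data ctx;   /* per-message state                    */
+;;     uint8_t tag[16];
+;;
+;;     aes_gcm_init_128_avx_gen2(&key, &ctx, iv, aad, aad_len);
+;;     aes_gcm_enc_128_update_avx_gen2(&key, &ctx, out, in, len); /* may repeat */
+;;     aes_gcm_enc_128_finalize_avx_gen2(&key, &ctx, tag, sizeof(tag));
+;; ---------------------------------------------------------------------------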
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + + vpxor xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen2 +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, 1 + vpsrlq xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [TWOONE] + vpand xmm2, [POLY] + vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + 
pop r13 + pop r12 + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + + push r12 + push r13 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + push arg5 + sub rsp, 1*16 + vmovdqu [rsp + 0*16],xmm6 + mov arg5, [rsp + 1*16 + 8*3 + 8*5] +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 1*16 + pop arg5 +%endif + pop r13 + pop r12 +ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm11 + vmovdqu [rsp + 3*16],xmm14 + vmovdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 4*16] + vmovdqu xmm14 , [rsp + 3*16] + vmovdqu xmm11 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + 
vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm11 + vmovdqu [rsp + 3*16],xmm14 + vmovdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 4*16] + vmovdqu xmm14 , [rsp + 3*16] + vmovdqu xmm11 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 +ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + + FUNC_RESTORE + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm new file mode 100644 index 000000000..4a0b4f82e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm @@ -0,0 +1,3277 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation is explained in: +; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen4.asm!" 
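+; (GCM128_MODE, GCM192_MODE or GCM256_MODE is normally defined by the small
+;  per-key-size wrapper file that pulls in this source)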
+%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +;; Decide on AES-GCM key size to compile for +%ifdef GCM128_MODE +%define NROUNDS 9 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION +%endif + +%ifdef GCM192_MODE +%define NROUNDS 11 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION +%endif + +%ifdef GCM256_MODE +%define NROUNDS 13 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION +%endif + +section .text +default rel + +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 + vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 + vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 + vpxor %%GH, %%GH, %%T3 + + + vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs + vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs + + vpxor %%T1, %%T1, %%T3 + vpxor %%GH, %%GH, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%GH, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%GH, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%GH, %%T3, %%GH, 0x10 + vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%GH, %%GH, %%T1 ; the result is in %%GH + +%endmacro + + +; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4 +; functions, but are kept to allow users to switch cpu architectures between calls +; of pre, init, update, and 
finalize. +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa %%T5, %%HK + + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly + vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly + vmovdqu [%%GDATA + HashKey_3], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_3_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly + vmovdqu [%%GDATA + HashKey_4], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly + vmovdqu [%%GDATA + HashKey_5], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_5_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly + vmovdqu [%%GDATA + HashKey_6], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly + vmovdqu [%%GDATA + HashKey_7], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly + vmovdqu [%%GDATA + HashKey_8], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_8_k], %%T1 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + vpxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + vpinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + ;; NOTE: in current implementation check for zero length is obsolete here. + ;; The adequate checks are done by callers of this macro. 
+ ;; cmp %%COUNTER, 0 + ;; je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + vpinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + vmovdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. 
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + +%%_data_read: ;Finished reading in data + + + vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + vmovdqa xmm3, xmm1 + vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpand xmm3, xmm1 + vpshufb xmm3, [SHUF_MASK] + vpshufb xmm3, xmm2 + vpxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpshufb xmm9, [SHUF_MASK] + vpshufb xmm9, xmm2 + vpxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + 
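+        ;; Note: on the %%_partial_incomplete_2 path above only PBlockLen was
+        ;; advanced; the deferred GHASH multiply for the still-incomplete block
+        ;; happens once the block is filled by a later update call, or in
+        ;; GCM_COMPLETE.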
vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + vpshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +%macro GHASH_SINGLE_MUL 9 +%define %%GDATA %1 +%define %%HASHKEY %2 +%define %%CIPHER %3 +%define %%STATE_11 %4 +%define %%STATE_00 %5 +%define %%STATE_MID %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%FIRST %9 + + vmovdqu %%T1, [%%GDATA + %%HASHKEY] +%ifidn %%FIRST, first + vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 + vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%else + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 + vpxor %%STATE_11, %%STATE_11, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 + vpxor %%STATE_00, %%STATE_00, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 + vpxor %%STATE_MID, %%STATE_MID, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%endif + +%endmacro + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 
+; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 23 +%define %%GDATA_KEY %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%LENGTH %4 +%define %%DATA_OFFSET %5 +%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 +%define %%CTR %12 +%define %%XMM1 %13 +%define %%XMM2 %14 +%define %%XMM3 %15 +%define %%XMM4 %16 +%define %%XMM5 %17 +%define %%XMM6 %18 +%define %%XMM7 %19 +%define %%XMM8 %20 +%define %%T6 %21 +%define %%T_key %22 +%define %%ENC_DEC %23 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; Start AES for %%num_initial_blocks blocks + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +%if(%%num_initial_blocks>0) +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%endif ; %if(%%num_initial_blocks>0) + + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; Write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [SHUF_MASK] +%assign i (i+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%if(%%num_initial_blocks>0) + vmovdqa %%T3, reg(i) +%assign i (i+1) + +%rep %%num_initial_blocks-1 + vmovdqu [rsp + TMP %+ i], reg(i) +%assign i (i+1) +%endrep +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Haskey_i_k holds XORed values of the low and high parts of + ;; the Haskey_i + vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0 + vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0 + vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0 + vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0 + vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0 + vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0 + vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0 + vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0 + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%XMM1, %%T_key + vpxor %%XMM2, %%XMM2, %%T_key + vpxor %%XMM3, %%XMM3, %%T_key + vpxor %%XMM4, %%XMM4, %%T_key + vpxor %%XMM5, %%XMM5, %%T_key + vpxor %%XMM6, %%XMM6, %%T_key + vpxor %%XMM7, %%XMM7, %%T_key + vpxor %%XMM8, %%XMM8, %%T_key + +%assign i 
(8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) + +%define %%T4_2 %%T4 +%if(%%num_initial_blocks>0) + ;; Hash in AES state + ;; T2 - incoming AAD hash + vpxor %%T2, %%T3 + + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*1] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*2] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>1) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*3] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*4] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>2) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>3) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*5] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*6] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>4) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*7] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*8] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign 
i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>5) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*9] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%ifndef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>6) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif + +%ifdef GCM192_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif +%ifdef GCM256_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>7) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM256_MODE ; GCM256 + vmovdqu %%T_key, [%%GDATA_KEY+16*13] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*14] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif ; GCM256 mode + +%if(%%num_initial_blocks>0) + vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor 
%%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + ;; First phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + ;; Second phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; The result is in %%T3 + vpxor %%T3, %%T1, %%T4 +%else + ;; The hash should end up in T3 + vmovdqa %%T3, %%T2 +%endif + + ;; Final hash is now in T3 +%if %%num_initial_blocks > 0 + ;; NOTE: obsolete in case %%num_initial_blocks = 0 + sub %%LENGTH, 16*%%num_initial_blocks +%endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + vpxor %%XMM6, %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + +%if %%num_initial_blocks > 0 + ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 + ;; This macro is executed for lenght 128 and up, + ;; zero length is checked in GCM_ENC_DEC. + ;; If the last block is partial then the xor will be done later + ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. 
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jl %%_initial_skip_last_word_write +%endif + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + ;; Update %%LENGTH with the number of blocks processed + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 +%%_initial_skip_last_word_write: + sub %%LENGTH, 128-16 + add %%DATA_OFFSET, 128-16 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + ;; Combine GHASHed value with the corresponding ciphertext + vpxor %%XMM1, %%XMM1, %%T3 + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + +;;; INITIAL_BLOCKS macro with support for a partial final block. +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. +%macro INITIAL_BLOCKS_PARTIAL 25 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) +%define %%T1 %8 +%define %%T2 %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 +%define %%INSTANCE_TYPE %25 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ;; Compute AES counters + vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ; Start AES for %%num_initial_blocks blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%if %%num_initial_blocks > 0 +%rep %%num_initial_blocks-1 + ;; Encrypt the message for all but the last block + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb 
reg(i), [rel SHUF_MASK] +%assign i (i+1) +%endrep +%endif + ;; The final block of data may be <16B + sub %%LENGTH, 16*(%%num_initial_blocks-1) + +%if %%num_initial_blocks < 8 + ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. + cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Encrypt the message + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [rel SHUF_MASK] + + ;; Hash all of the data +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%assign rep_count (%%num_initial_blocks-1) +%if rep_count > 0 +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep +%endif + ;; Record that a reduction is needed + mov r12, 1 + + jmp %%_small_initial_compute_hash + + +%endif ; %if %%num_initial_blocks < 8 + +%%_small_initial_partial_block: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle ghash for a <16B final block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of call we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) + ;; Handle a partial final block + ;; GDATA, KEY, T1, T2 + ;; r13 - length + ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long + ;; NOTE: could be replaced with %%LENGTH but at this point + ;; %%LENGTH is always less than 16. + ;; No PLAIN_CYPH_LEN argument available in this macro. 
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET + vpshufb reg(i), [SHUF_MASK] + +%ifidn %%INSTANCE_TYPE, multi_call +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks-1) +%assign last_block_to_hash 1 +%else +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Record that a reduction is needed + mov r12, 1 + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length. + mov r12, 0 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%ifidn %%INSTANCE_TYPE, multi_call +%assign rep_count (%%num_initial_blocks-2) +%%_multi_call_hash: +%else +%assign rep_count (%%num_initial_blocks-1) +%endif +%if rep_count > 0 +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep +%endif + +%%_small_initial_compute_hash: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Ghash reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize. + cmp r12, 0 + je %%_no_reduction_needed +%endif +%endif + + vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor %%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + ;; shift-L xmm2 2 DWs + vpslldq %%T2, %%T2, 8 + vpxor %%T4, %%T4, %%T2 + + ;; First phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Second phase of the reduction + + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + + vpxor %%T4, %%T4, %%T2 + ;; Second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T3, %%T1, %%T4 + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. 
+%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place +%if %%num_initial_blocks != 8 + ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero + cmp qword [%%GDATA_CTX + PBlockLen], 0 + je %%_no_partial_block_xor +%endif ; %%num_initial_blocks != 8 + vpxor %%T3, %%T3, reg(8) +%%_no_partial_block_xor: +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%_no_reduction_needed case only valid for + ;; multi_call with initial_blocks = 1. + ;; Look for comment above around '_no_reduction_needed' + ;; The jmp below is obsolete as the code will fall through. + + ;; The result is in %%T3 + jmp %%_after_reduction + +%%_no_reduction_needed: + ;; The hash should end up in T3. The only way we should get here is if + ;; there is a partial block of data, so xor that into the hash. + vpxor %%T3, %%T2, reg(8) +%endif ; %%INSTANCE_TYPE = multi_call +%endif ; %%num_initial_blocks=1 + +%%_after_reduction: + ;; Final hash is now in T3 + +%endmacro ; INITIAL_BLOCKS_PARTIAL + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 23 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 +%define %%FULL_PARTIAL %23 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], %%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vmovdqu %%T5, [TWO] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 + + vmovdqu %%T5, [SHUF_MASK] + vpshufb %%XMM1, %%T5 ; perform a 16Byte swap + vpshufb %%XMM2, %%T5 ; perform a 16Byte swap + vpshufb %%XMM3, %%T5 ; perform a 16Byte swap + vpshufb %%XMM4, %%T5 ; perform a 16Byte swap + vpshufb %%XMM5, %%T5 ; perform a 16Byte swap + vpshufb %%XMM6, %%T5 ; perform a 16Byte swap + vpshufb %%XMM7, %%T5 ; perform a 16Byte swap + vpshufb %%XMM8, %%T5 ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vmovdqu %%T5, [TWOf] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + vpxor %%XMM2, %%XMM2, %%T1 + vpxor %%XMM3, %%XMM3, %%T1 + vpxor %%XMM4, %%XMM4, %%T1 + vpxor %%XMM5, %%XMM5, %%T1 + vpxor %%XMM6, %%XMM6, %%T1 + vpxor %%XMM7, %%XMM7, %%T1 + vpxor 
%%XMM8, %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, 
%%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T1, %%T4, %%T3 + + + vmovdqu %%T5, [%%GDATA + 16*10] + %ifndef GCM128_MODE ; GCM192 or GCM256 + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif ; GCM256 + +%assign i 0 +%assign j 1 +%rep 8 + + ;; SNP TBD: This is pretty ugly - consider whether just XORing the + ;; data in after vaesenclast is simpler and performant. Would + ;; also have to ripple it through partial block and ghash_mul_8. 
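The loop that follows implements the trick the SNP TBD note above alludes to: AESENCLAST simply XORs its round-key operand into the state after the final SubBytes/ShiftRows, so folding the plaintext into the last round key makes the CTR keystream XOR ride along with the final AES round. A minimal intrinsics sketch of that identity (illustrative only, not taken from this file):

    #include <wmmintrin.h>   /* AES-NI intrinsics */

    /* aesenclast(state, rk ^ data) == aesenclast(state, rk) ^ data,
     * so XORing the data into the last round key produces the
     * ciphertext block in the same instruction. */
    static __m128i last_round_with_data(__m128i state, __m128i rk_last, __m128i data)
    {
        return _mm_aesenclast_si128(state, _mm_xor_si128(rk_last, data));
    }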
+%ifidn %%FULL_PARTIAL, full + %ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %endif + +%else + ; Don't read the final data during partial block processing + %ifdef NT_LD + %if (i<7) + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %else + %if (i<7) + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + %if (i<7) + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + ;; Do not read the data since it could fault + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %else + vaesenclast reg(j), reg(j), %%T2 + %endif + %endif +%endif + +%assign i (i+1) +%assign j (j+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T7, %%T3 + vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + %ifidn %%ENC_DEC, ENC + ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 + %ifidn %%FULL_PARTIAL, full + ;; Avoid writing past the buffer if handling a partial block + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 + %endif + %endif + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform 
a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + + vpxor %%XMM1, %%T1 + + +%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL + + +; GHASH the last 4 ciphertext blocks. +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ;; Karatsuba Method + + vmovdqu %%T5, [%%GDATA + HashKey_8] + + vpshufd %%T2, %%XMM1, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM1 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpshufd %%T2, %%XMM2, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM2 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpshufd %%T2, %%XMM3, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM3 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpshufd %%T2, %%XMM4, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM4 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpshufd %%T2, %%XMM5, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM5 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpshufd %%T2, %%XMM6, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM6 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpshufd %%T2, %%XMM7, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM7 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey] + vpshufd %%T2, %%XMM8, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM8 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + 
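GHASH_LAST_8 above uses the Karatsuba method: for each block it forms a1^a0 and b1^b0 with vpshufd/vpxor so the 128x128-bit carry-less product needs only three VPCLMULQDQs instead of four, and the high, low, and middle partial products are accumulated across all eight blocks before the single reduction that follows. A sketch of one such multiply with PCLMUL intrinsics (illustrative, not code from this file):

    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 */
    #include <emmintrin.h>

    /* 128x128 -> 256-bit carry-less multiply, Karatsuba style:
     * hi = a1*b1, lo = a0*b0, mid = (a1^a0)*(b1^b0) ^ hi ^ lo,
     * result = hi<<128 ^ mid<<64 ^ lo. */
    static void clmul128_karatsuba(__m128i a, __m128i b,
                                   __m128i *res_hi, __m128i *res_lo)
    {
        __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);           /* a1*b1 */
        __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);           /* a0*b0 */
        __m128i am  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        __m128i bm  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
        __m128i mid = _mm_clmulepi64_si128(am, bm, 0x00);         /* (a1^a0)*(b1^b0) */
        mid = _mm_xor_si128(mid, _mm_xor_si128(hi, lo));
        *res_lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
        *res_hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
    }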
+ vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T7, %%T4 + vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 +%endmacro + + +; GHASH the last 4 ciphertext blocks. +%macro GHASH_LAST_7 15 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 + + ;; Karatsuba Method + + vmovdqu %%T5, [%%GDATA + HashKey_7] + + vpshufd %%T2, %%XMM1, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM1 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpshufd %%T2, %%XMM2, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM2 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpshufd %%T2, %%XMM3, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM3 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpshufd %%T2, %%XMM4, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM4 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpshufd %%T2, %%XMM5, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM5 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpshufd %%T2, %%XMM6, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM6 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, 
%%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_1] + vpshufd %%T2, %%XMM7, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM7 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T7, %%T4 + vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 +%endmacro + + + +;;; Handle encryption of the final partial block +;;; IN: +;;; r13 - Number of bytes to read +;;; MODIFIES: +;;; KEY - Key for encrypting the partial block +;;; HASH - Current hash value +;;; SMASHES: +;;; r10, r12, r15, rax +;;; T1, T2 +;;; Note: +;;; PLAIN_CYPH_LEN, %7, is passed only to determine +;;; if buffer is big enough to do a 16 byte read & shift. +;;; 'LT16' is passed here only if buffer is known to be smaller +;;; than 16 bytes. +;;; Any other value passed here will result in 16 byte read +;;; code path. 
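The macro defined next reads the trailing len < 16 input bytes (or, on the >=16-byte path, the last 16 bytes shifted into place), XORs them with the keystream block, writes exactly len output bytes, and masks the top 16-len bytes of the block that is later fed to GHASH; on decrypt the masked ciphertext, not the plaintext, is what gets hashed. In scalar form the effect is roughly the following (a sketch only; the helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the partial-block step: 'ks' is E(K, Yn), 'len' < 16. */
    static void final_partial_block(uint8_t hash_in[16], uint8_t *out,
                                    const uint8_t *in, size_t len,
                                    const uint8_t ks[16], int is_dec)
    {
        uint8_t buf[16] = {0};
        memcpy(buf, in, len);                      /* READ_SMALL_DATA_INPUT path  */
        for (size_t i = 0; i < len; i++)
            out[i] = buf[i] ^ ks[i];               /* plaintext XOR E(K, Yn)      */
        memset(hash_in, 0, 16);                    /* mask out top 16-len bytes   */
        memcpy(hash_in, is_dec ? buf : out, len);  /* GHASH always sees ciphertext */
    }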
+;;; TBD: Remove HASH from the instantiation +%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8 +%define %%KEY %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%CYPH_PLAIN_OUT %4 +%define %%PLAIN_CYPH_IN %5 +%define %%PLAIN_CYPH_LEN %6 +%define %%ENC_DEC %7 +%define %%DATA_OFFSET %8 + + ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting +%ifidn %%PLAIN_CYPH_LEN, LT16 + ;; Handle the case where the message is < 16 bytes + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + + ;; T1 - packed output + ;; r10 - input data address + ;; r13 - input data length + ;; r12, r15, rax - temp registers + READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 +%else + ;; Handle the case where the message is >= 16 bytes + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + ;; Receive the last <16 Byte block + vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + lea r12, [SHIFT_MASK + 16] + ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes + ;; (r13 is the number of bytes in plaintext mod 16) + sub r12, r13 + ;; Get the appropriate shuffle mask + vmovdqu %%T2, [r12] + ;; shift right 16-r13 bytes + vpshufb %%T1, %%T2 +%endif ; %%PLAIN_CYPH_LEN, LT16 + + ;; At this point T1 contains the partial block data +%ifidn %%ENC_DEC, DEC + ;; Plaintext XOR E(K, Yn) + ;; Set aside the ciphertext + vmovdqa %%T2, %%T1 + vpxor %%KEY, %%KEY, %%T1 + ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext + vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] + ;; Mask out top 16-r13 bytes of ciphertext + vpand %%KEY, %%KEY, %%T1 + + ;; Prepare the ciphertext for the hash + ;; mask out top 16-r13 bytes of the plaintext + vpand %%T2, %%T2, %%T1 +%else + ;; Plaintext XOR E(K, Yn) + vpxor %%KEY, %%KEY, %%T1 + ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY + vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] + ;; Mask out top 16-r13 bytes of %%KEY + vpand %%KEY, %%KEY, %%T1 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Output r13 Bytes + vmovq rax, %%KEY + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq %%T1, %%KEY, 8 + vmovq rax, %%T1 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn %%ENC_DEC, DEC + ;; If decrypt, restore the ciphertext into %%KEY + vmovdqu %%KEY, %%T2 +%endif +%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK + + + +; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + push rsi + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + vmovdqu [rsp + 
LOCAL_STORAGE + 8*16],xmm14 + vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 + + mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5] +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16] + vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16] + vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16] + vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16] + vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16] + vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16] + vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16] + vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16] + vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16] + vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +; Additional Authentication data (A_IN), Additional Data length (A_LEN). +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. +; Clobbers rax, r10-r13, and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm14 +%define %%SUBHASH xmm1 + + + vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + mov r10, %%A_LEN + cmp r10, 0 + je %%_aad_is_zero + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + jmp %%_after_aad + +%%_aad_is_zero: + vpxor %%AAD_HASH, %%AAD_HASH + +%%_after_aad: + mov r10, %%A_LEN + vpxor xmm2, xmm3 + + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + vpinsrq xmm2, [r10], 0 + vpinsrd xmm2, [r10+8], 2 + vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + vpshufb xmm2, [SHUF_MASK] + + vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + +%macro GCM_ENC_DEC_SMALL 12 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET %7 +%define %%LENGTH %8 +%define %%NUM_BLOCKS %9 +%define %%CTR %10 +%define %%HASH %11 +%define %%INSTANCE_TYPE %12 + + ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC. 
+ ;; cmp %%NUM_BLOCKS, 0 + ;; je %%_small_initial_blocks_encrypted + cmp %%NUM_BLOCKS, 8 + je %%_small_initial_num_blocks_is_8 + cmp %%NUM_BLOCKS, 7 + je %%_small_initial_num_blocks_is_7 + cmp %%NUM_BLOCKS, 6 + je %%_small_initial_num_blocks_is_6 + cmp %%NUM_BLOCKS, 5 + je %%_small_initial_num_blocks_is_5 + cmp %%NUM_BLOCKS, 4 + je %%_small_initial_num_blocks_is_4 + cmp %%NUM_BLOCKS, 3 + je %%_small_initial_num_blocks_is_3 + cmp %%NUM_BLOCKS, 2 + je %%_small_initial_num_blocks_is_2 + + jmp %%_small_initial_num_blocks_is_1 + + +%%_small_initial_num_blocks_is_8: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_7: + ;; r13 - %%LENGTH + ;; xmm12 - T1 + ;; xmm13 - T2 + ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys + ;; xmm15 - T4 + ;; xmm11 - T5 + ;; xmm9 - CTR + ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys + ;; xmm2 - XMM2 + ;; xmm3 - XMM3 + ;; xmm4 - XMM4 + ;; xmm5 - XMM5 + ;; xmm6 - XMM6 + ;; xmm7 - XMM7 + ;; xmm8 - XMM8 - AAD HASH IN + ;; xmm10 - T6 + ;; xmm0 - T_key + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_6: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_5: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_4: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_3: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_2: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_1: + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE + + ;; Note: zero initial blocks not allowed. 
+ +%%_small_initial_blocks_encrypted: + +%endmacro ; GCM_ENC_DEC_SMALL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 7 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%INSTANCE_TYPE %7 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' + + cmp %%PLAIN_CYPH_LEN, 0 + je %%_enc_dec_done + + xor %%DATA_OFFSET, %%DATA_OFFSET + ;; Update length of data processed + add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + vmovdqu xmm8, [%%GDATA_CTX + AadHash] + +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: partial block processing makes only sense for multi_call here. + ;; Used for the update flow - if there was a previous partial + ;; block fill the remaining bytes here. + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC +%endif + + ;; lift CTR set from initial_blocks to here +%ifidn %%INSTANCE_TYPE, single_call + vmovdqu xmm9, xmm2 +%else + vmovdqu xmm9, [%%GDATA_CTX + CurCount] +%endif + + ;; Save the amount of data left to process in r10 + mov r13, %%PLAIN_CYPH_LEN +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%DATA_OFFSET is zero in single_call case. + ;; Consequently PLAIN_CYPH_LEN will never be zero after + ;; %%DATA_OFFSET subtraction below. + sub r13, %%DATA_OFFSET + + ;; There may be no more data if it was consumed in the partial block. + cmp r13, 0 + je %%_enc_dec_done +%endif ; %%INSTANCE_TYPE, multi_call + mov r10, r13 + + ;; Determine how many blocks to process in INITIAL + mov r12, r13 + shr r12, 4 + and r12, 7 + + ;; Process one additional block in INITIAL if there is a partial block + and r10, 0xf + blsmsk r10, r10 ; Set CF if zero + cmc ; Flip CF + adc r12, 0x0 ; Process an additional INITIAL block if CF set + + ;; Less than 127B will be handled by the small message code, which + ;; can process up to 7 16B blocks. + cmp r13, 128 + jge %%_large_message_path + + GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE + jmp %%_ghash_done + +%%_large_message_path: + and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will + ; can be handled by the x8 partial loop. 
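The few instructions above decide how many blocks go through INITIAL: the blsmsk/cmc/adc flag dance adds one extra block exactly when the length is not a multiple of 16, without a branch, and the large-message path then masks the count to seven so a full set of eight falls through to the x8 loop. In plain C the quantity being computed is simply (sketch only, name illustrative):

    #include <stdint.h>

    /* Blocks handled by INITIAL on the large-message path:
     * (len/16) mod 8, plus one more if a partial block exists. */
    static unsigned initial_block_count(uint64_t len)
    {
        unsigned n = (unsigned)((len >> 4) & 7);
        if (len & 0xf)
            n++;
        return n & 7;   /* a count of 8 is left to the x8 partial loop */
    }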
+ + cmp r12, 0 + je %%_initial_num_blocks_is_0 + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + ;; r13 - %%LENGTH + ;; xmm12 - T1 + ;; xmm13 - T2 + ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys + ;; xmm15 - T4 + ;; xmm11 - T5 + ;; xmm9 - CTR + ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys + ;; xmm2 - XMM2 + ;; xmm3 - XMM3 + ;; xmm4 - XMM4 + ;; xmm5 - XMM5 + ;; xmm6 - XMM6 + ;; xmm7 - XMM7 + ;; xmm8 - XMM8 - AAD HASH IN + ;; xmm10 - T6 + ;; xmm0 - T_key + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + ;; The entire message was encrypted processed in initial and now need to be hashed + cmp r13, 0 + je %%_encrypt_done + + ;; Encrypt the final <16 byte (partial) block, then hash + cmp r13, 16 + jl %%_encrypt_final_partial + + ;; Process 7 full blocks plus a partial block + cmp r13, 128 + jl %%_encrypt_by_8_partial + + +%%_encrypt_by_8_parallel: + ;; in_order vs. out_order is an optimization to increment the counter without shuffling + ;; it back into little endian. r15d keeps track of when we need to increent in order so + ;; that the carry is handled correctly. 
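The in_order/out_order comment above concerns the 32-bit big-endian block counter in the last four bytes of the counter block: the fast path increments the byte-reflected counter directly and only falls back to a shuffle, increment, shuffle-back sequence when the low counter byte could wrap within the next eight blocks, which is what r15d tracks. A plain reference for the counter arithmetic being preserved (sketch only):

    #include <stdint.h>

    /* GCM increments only the low 32 bits of the counter block, big-endian. */
    static void ctr32_add_be(uint8_t block[16], uint32_t add)
    {
        uint32_t c = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
                     ((uint32_t)block[14] << 8)  |  (uint32_t)block[15];
        c += add;
        block[12] = (uint8_t)(c >> 24);
        block[13] = (uint8_t)(c >> 16);
        block[14] = (uint8_t)(c >> 8);
        block[15] = (uint8_t)c;
    }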
+ vmovd r15d, xmm9 + and r15d, 255 + vpshufb xmm9, [rel SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + ;; xmm0 - T1 + ;; xmm10 - T2 + ;; xmm11 - T3 + ;; xmm12 - T4 + ;; xmm13 - T5 + ;; xmm14 - T6 + ;; xmm9 - CTR + ;; xmm1 - XMM1 + ;; xmm2 - XMM2 + ;; xmm3 - XMM3 + ;; xmm4 - XMM4 + ;; xmm5 - XMM5 + ;; xmm6 - XMM6 + ;; xmm7 - XMM7 + ;; xmm8 - XMM8 + ;; xmm15 - T7 + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full + add %%DATA_OFFSET, 128 + sub r13, 128 + cmp r13, 128 + jge %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + jmp %%_encrypt_by_8_parallel_done + +%%_encrypt_by_8: + vpshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full + vpshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + cmp r13, 128 + jge %%_encrypt_by_8_new + vpshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_parallel_done: + ;; Test to see if we need a by 8 with partial block. At this point + ;; bytes remaining should be either zero or between 113-127. + cmp r13, 0 + je %%_encrypt_done + +%%_encrypt_by_8_partial: + ;; Shuffle needed to align key for partial block xor. out_order + ;; is a little faster because it avoids extra shuffles. + ;; TBD: Might need to account for when we don't have room to increment the counter. + + + ;; Process parallel buffers with a final partial block. + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial + + + add %%DATA_OFFSET, 128-16 + sub r13, 128-16 + +%%_encrypt_final_partial: + + vpshufb xmm8, [SHUF_MASK] + mov [%%GDATA_CTX + PBlockLen], r13 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8 + + ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext + ;; GDATA, KEY, T1, T2 + ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET + + vpshufb xmm8, [SHUF_MASK] + + +%%_encrypt_done: + + ;; Mapping to macro parameters + ;; IN: + ;; xmm9 contains the counter + ;; xmm1-xmm8 contain the xor'd ciphertext + ;; OUT: + ;; xmm14 contains the final hash + ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 +%ifidn %%INSTANCE_TYPE, multi_call + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + jz %%_hash_last_8 + GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + ;; XOR the partial word into the hash + vpxor xmm14, xmm14, xmm8 + jmp %%_ghash_done +%endif +%%_hash_last_8: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + +%%_ghash_done: + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14 + +%%_enc_dec_done: + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. 
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%INSTANCE_TYPE %6 +%define %%PLAIN_CYPH_LEN rax + + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + ;; Start AES as early as possible + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If the GCM function is called as a single function call rather + ;; than invoking the individual parts (init, update, finalize) we + ;; can remove a write to read dependency on AadHash. + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + + ;; Encrypt the final partial block. If we did this as a single call then + ;; the partial block was handled in the main GCM_ENC_DEC macro. + mov r12, [%%GDATA_CTX + PBlockLen] + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + +%endif + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 + vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + vpxor xmm9, xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen4 / +; aes_gcm_precomp_192_avx_gen4 / +; aes_gcm_precomp_256_avx_gen4 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [TWOONE] + vpand xmm2, xmm2, [POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + 
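The precomputation above derives the GHASH key from H = E(K, 0^128): after the byte swap it shifts the 128-bit value left by one and, if a bit fell off the top, folds in the field polynomial (the POLY constant), giving HashKey<<1 mod poly in the bit-reflected representation the multipliers expect. A scalar sketch of that step (illustrative only):

    #include <stdint.h>

    /* h[1] is the high qword, h[0] the low qword, already byte-reflected. */
    static void hashkey_shl1_mod_poly(uint64_t h[2])
    {
        uint64_t carry = h[1] >> 63;
        h[1] = (h[1] << 1) | (h[0] >> 63);
        h[0] <<= 1;
        if (carry) {                        /* reduce: fold in POLY */
            h[1] ^= 0xC200000000000000ULL;
            h[0] ^= 0x0000000000000001ULL;
        }
    }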
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + push arg5 + sub rsp, 1*16 + vmovdqu [rsp + 0*16],xmm6 + mov arg5, [rsp + 1*16 + 8*3 + 8*5] +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 + pop arg5 +%endif + pop r13 + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / +; aes_gcm_enc_128_update_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / +; aes_gcm_dec_256_update_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / +; aes_gcm_enc_256_finalize_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm11 + vmovdqu [rsp + 3*16], xmm14 + vmovdqu [rsp + 4*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 4*16] + vmovdqu xmm14, [rsp + 3*16] + vmovdqu xmm11, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + 
add rsp, 5*16 +%endif + + pop r12 +ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 +; aes_gcm_dec_256_finalize_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm11 + vmovdqu [rsp + 3*16], xmm14 + vmovdqu [rsp + 4*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 4*16] + vmovdqu xmm14, [rsp + 3*16] + vmovdqu xmm11, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call + + FUNC_RESTORE + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm new file mode 100644 index 000000000..e823b7959 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm @@ -0,0 +1,291 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef GCM_DEFINES_ASM_INCLUDED +%define GCM_DEFINES_ASM_INCLUDED + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford + + +;;;;;; + +section .data + +align 16 + +POLY dq 0x0000000000000001, 0xC200000000000000 + +align 64 +POLY2 dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 +align 16 +TWOONE dq 0x0000000000000001, 0x0000000100000000 + +; order of these constants should not change. 
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F + +align 64 +SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 +ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff +ZERO dq 0x0000000000000000, 0x0000000000000000 +ONE dq 0x0000000000000001, 0x0000000000000000 +TWO dq 0x0000000000000002, 0x0000000000000000 +ONEf dq 0x0000000000000000, 0x0100000000000000 +TWOf dq 0x0000000000000000, 0x0200000000000000 + +align 64 +ddq_add_1234: + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_5678: + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_add_4444: + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_8888: + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_addbe_1234: + dq 0x0000000000000000, 0x0100000000000000 + dq 0x0000000000000000, 0x0200000000000000 + dq 0x0000000000000000, 0x0300000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_5678: + dq 0x0000000000000000, 0x0500000000000000 + dq 0x0000000000000000, 0x0600000000000000 + dq 0x0000000000000000, 0x0700000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +ddq_addbe_4444: + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_8888: + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +byte_len_to_mask_table: + dw 0x0000, 0x0001, 0x0003, 0x0007, + dw 0x000f, 0x001f, 0x003f, 0x007f, + dw 0x00ff, 0x01ff, 0x03ff, 0x07ff, + dw 0x0fff, 0x1fff, 0x3fff, 0x7fff, + dw 0xffff + +align 64 +byte64_len_to_mask_table: + dq 0x0000000000000000, 0x0000000000000001 + dq 0x0000000000000003, 0x0000000000000007 + dq 0x000000000000000f, 0x000000000000001f + dq 0x000000000000003f, 0x000000000000007f + dq 0x00000000000000ff, 0x00000000000001ff + dq 0x00000000000003ff, 0x00000000000007ff + dq 0x0000000000000fff, 0x0000000000001fff + dq 0x0000000000003fff, 0x0000000000007fff + dq 0x000000000000ffff, 0x000000000001ffff + dq 0x000000000003ffff, 0x000000000007ffff + dq 0x00000000000fffff, 0x00000000001fffff + dq 0x00000000003fffff, 0x00000000007fffff + dq 0x0000000000ffffff, 0x0000000001ffffff + dq 0x0000000003ffffff, 0x0000000007ffffff + dq 0x000000000fffffff, 0x000000001fffffff + dq 0x000000003fffffff, 0x000000007fffffff + dq 0x00000000ffffffff, 0x00000001ffffffff + dq 0x00000003ffffffff, 0x00000007ffffffff + dq 0x0000000fffffffff, 0x0000001fffffffff + dq 0x0000003fffffffff, 0x0000007fffffffff + dq 0x000000ffffffffff, 0x000001ffffffffff + dq 0x000003ffffffffff, 0x000007ffffffffff + dq 0x00000fffffffffff, 0x00001fffffffffff + dq 0x00003fffffffffff, 0x00007fffffffffff + dq 
0x0000ffffffffffff, 0x0001ffffffffffff + dq 0x0003ffffffffffff, 0x0007ffffffffffff + dq 0x000fffffffffffff, 0x001fffffffffffff + dq 0x003fffffffffffff, 0x007fffffffffffff + dq 0x00ffffffffffffff, 0x01ffffffffffffff + dq 0x03ffffffffffffff, 0x07ffffffffffffff + dq 0x0fffffffffffffff, 0x1fffffffffffffff + dq 0x3fffffffffffffff, 0x7fffffffffffffff + dq 0xffffffffffffffff + +align 64 +mask_out_top_block: + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0x0000000000000000, 0x0000000000000000 + +section .text + + +;;define the fields of gcm_data struct +;typedef struct gcm_data +;{ +; u8 expanded_keys[16*15]; +; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here +; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here +; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here +; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here +; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here +; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here +; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here +; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here +; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +;} gcm_data; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define HashKey 16*15 ; store HashKey <<1 mod poly here +%define HashKey_1 16*15 ; store HashKey <<1 mod poly here +%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here +%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here +%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here +%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here +%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here +%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here +%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here +%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod 
poly here (for Karatsuba purposes) +%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +%endif + +%define AadHash 16*0 ; store current Hash of data which has been input +%define AadLen 16*1 ; store length of input data which will not be encrypted or decrypted +%define InLen (16*1)+8 ; store length of input data which will be encrypted or decrypted +%define PBlockEncKey 16*2 ; encryption key for the partial block at the end of the previous update +%define OrigIV 16*3 ; input IV +%define CurCount 16*4 ; Current counter for generation of encryption key +%define PBlockLen 16*5 ; length of partial block at the end of the previous update + +%define reg(q) xmm %+ q +%define arg(x) [r14 + STACK_OFFSET + 8*x] + + + + +%ifnidn __OUTPUT_FORMAT__, elf64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 + %xdefine arg4 r9 + %xdefine arg5 rsi ;[r14 + STACK_OFFSET + 8*5] - need push and load + %xdefine arg6 [r14 + STACK_OFFSET + 8*6] + %xdefine arg7 [r14 + STACK_OFFSET + 8*7] + %xdefine arg8 [r14 + STACK_OFFSET + 8*8] + %xdefine arg9 [r14 + STACK_OFFSET + 8*9] + %xdefine arg10 [r14 + STACK_OFFSET + 8*10] + +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx + %xdefine arg4 rcx + %xdefine arg5 r8 + %xdefine arg6 r9 + %xdefine arg7 [r14 + STACK_OFFSET + 8*1] + %xdefine arg8 [r14 + STACK_OFFSET + 8*2] + %xdefine arg9 [r14 + STACK_OFFSET + 8*3] + %xdefine arg10 [r14 + STACK_OFFSET + 8*4] +%endif + +%ifdef NT_LDST + %define NT_LD + %define NT_ST +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_LD + %define XLDR movntdqa + %define VXLDR vmovntdqa + %define VX512LDR vmovntdqa +%else + %define XLDR movdqu + %define VXLDR vmovdqu + %define VX512LDR vmovdqu8 +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_ST + %define XSTR movntdq + %define VXSTR vmovntdq + %define VX512STR vmovntdq +%else + %define XSTR movdqu + %define VXSTR vmovdqu + %define VX512STR vmovdqu8 +%endif + +%endif ; GCM_DEFINES_ASM_INCLUDED diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm new file mode 100644 index 000000000..fd8aa05a6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm @@ -0,0 +1,233 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define GCM_KEYS_VAES_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%ifdef GCM_BIG_DATA +;; +;; Key structure holds up to 128 ghash keys +;; +%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly +%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly +%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly +%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly +%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly +%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly +%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly +%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly +%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly +%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly +%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly +%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly +%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly +%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly +%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly +%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly +%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly +%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly +%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly +%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly +%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly +%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly +%define HashKey_106 (16*37) ; HashKey^106 <<1 mod poly +%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly +%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly +%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly +%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly +%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly +%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly +%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly +%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly +%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly +%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly +%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly +%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly +%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly +%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly 
+%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly +%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly +%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly +%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly +%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly +%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly +%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly +%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly +%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly +%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly +%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly +%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly +%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly +%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly +%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly +%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly +%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly +%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly +%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly +%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly +%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly +%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly +%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly +%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly +%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly +%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly +%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly +%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly +%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly +%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly +%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly +%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly +%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly +%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly +%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly +%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly +%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly +%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly +%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly +%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly +%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly +%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly +%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly +%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*103) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly +%define HashKey_26 
(16*117) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*142) ; HashKey <<1 mod poly +%define HashKey (16*142) ; HashKey <<1 mod poly +%else +;; +;; Key structure holds up to 48 ghash keys +;; +%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*38) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly 
+%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*62) ; HashKey <<1 mod poly +%define HashKey (16*62) ; HashKey <<1 mod poly +%endif ; !GCM_BIG_DATA + +%endif ; GCM_KEYS_VAES_AVX512_INCLUDED diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm new file mode 100644 index 000000000..6f71e43fa --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm @@ -0,0 +1,184 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_gcm_init_128_sse +extern aes_gcm_init_128_avx_gen4 +extern aes_gcm_init_128_avx_gen2 + +extern aes_gcm_enc_128_sse +extern aes_gcm_enc_128_avx_gen4 +extern aes_gcm_enc_128_avx_gen2 +extern aes_gcm_enc_128_update_sse +extern aes_gcm_enc_128_update_avx_gen4 +extern aes_gcm_enc_128_update_avx_gen2 +extern aes_gcm_enc_128_finalize_sse +extern aes_gcm_enc_128_finalize_avx_gen4 +extern aes_gcm_enc_128_finalize_avx_gen2 + +extern aes_gcm_dec_128_sse +extern aes_gcm_dec_128_avx_gen4 +extern aes_gcm_dec_128_avx_gen2 +extern aes_gcm_dec_128_update_sse +extern aes_gcm_dec_128_update_avx_gen4 +extern aes_gcm_dec_128_update_avx_gen2 +extern aes_gcm_dec_128_finalize_sse +extern aes_gcm_dec_128_finalize_avx_gen4 +extern aes_gcm_dec_128_finalize_avx_gen2 + +extern aes_gcm_precomp_128_sse +extern aes_gcm_precomp_128_avx_gen4 +extern aes_gcm_precomp_128_avx_gen2 + +extern aes_gcm_init_256_sse +extern aes_gcm_init_256_avx_gen4 +extern aes_gcm_init_256_avx_gen2 + +extern aes_gcm_enc_256_sse +extern aes_gcm_enc_256_avx_gen4 +extern aes_gcm_enc_256_avx_gen2 +extern aes_gcm_enc_256_update_sse +extern aes_gcm_enc_256_update_avx_gen4 +extern aes_gcm_enc_256_update_avx_gen2 +extern aes_gcm_enc_256_finalize_sse +extern aes_gcm_enc_256_finalize_avx_gen4 +extern aes_gcm_enc_256_finalize_avx_gen2 + +extern aes_gcm_dec_256_sse +extern aes_gcm_dec_256_avx_gen4 +extern aes_gcm_dec_256_avx_gen2 +extern aes_gcm_dec_256_update_sse +extern aes_gcm_dec_256_update_avx_gen4 +extern aes_gcm_dec_256_update_avx_gen2 +extern aes_gcm_dec_256_finalize_sse +extern aes_gcm_dec_256_finalize_avx_gen4 +extern aes_gcm_dec_256_finalize_avx_gen2 + +extern aes_gcm_precomp_256_sse +extern aes_gcm_precomp_256_avx_gen4 +extern aes_gcm_precomp_256_avx_gen2 + +%if (AS_FEATURE_LEVEL) >= 10 +extern aes_gcm_precomp_128_vaes_avx512 +extern aes_gcm_init_128_vaes_avx512 +extern aes_gcm_enc_128_update_vaes_avx512 +extern aes_gcm_dec_128_update_vaes_avx512 +extern aes_gcm_enc_128_finalize_vaes_avx512 +extern aes_gcm_dec_128_finalize_vaes_avx512 +extern aes_gcm_enc_128_vaes_avx512 +extern aes_gcm_dec_128_vaes_avx512 + +extern aes_gcm_precomp_256_vaes_avx512 +extern aes_gcm_init_256_vaes_avx512 +extern aes_gcm_enc_256_update_vaes_avx512 +extern aes_gcm_dec_256_update_vaes_avx512 +extern aes_gcm_enc_256_finalize_vaes_avx512 +extern aes_gcm_dec_256_finalize_vaes_avx512 +extern aes_gcm_enc_256_vaes_avx512 +extern aes_gcm_dec_256_vaes_avx512 +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp +;;;; +mbin_interface aes_gcm_init_128 +mbin_dispatch_init7 aes_gcm_init_128, aes_gcm_init_128_sse, aes_gcm_init_128_sse, aes_gcm_init_128_avx_gen2, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_vaes_avx512 + +mbin_interface aes_gcm_enc_128 +mbin_dispatch_init7 aes_gcm_enc_128, aes_gcm_enc_128_sse, aes_gcm_enc_128_sse, aes_gcm_enc_128_avx_gen2, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_vaes_avx512 + +mbin_interface aes_gcm_enc_128_update +mbin_dispatch_init7 aes_gcm_enc_128_update, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_avx_gen2, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_vaes_avx512 + +mbin_interface aes_gcm_enc_128_finalize +mbin_dispatch_init7 aes_gcm_enc_128_finalize, 
aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_avx_gen2, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_vaes_avx512 + +mbin_interface aes_gcm_dec_128 +mbin_dispatch_init7 aes_gcm_dec_128, aes_gcm_dec_128_sse, aes_gcm_dec_128_sse, aes_gcm_dec_128_avx_gen2, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_vaes_avx512 + +mbin_interface aes_gcm_dec_128_update +mbin_dispatch_init7 aes_gcm_dec_128_update, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_avx_gen2, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_vaes_avx512 + +mbin_interface aes_gcm_dec_128_finalize +mbin_dispatch_init7 aes_gcm_dec_128_finalize, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_avx_gen2, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_vaes_avx512 + +mbin_interface aes_gcm_precomp_128 +mbin_dispatch_init7 aes_gcm_precomp_128, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_avx_gen2, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_vaes_avx512 + +;;;; +; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp +;;;; +mbin_interface aes_gcm_init_256 +mbin_dispatch_init7 aes_gcm_init_256, aes_gcm_init_256_sse, aes_gcm_init_256_sse, aes_gcm_init_256_avx_gen2, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_vaes_avx512 + +mbin_interface aes_gcm_enc_256 +mbin_dispatch_init7 aes_gcm_enc_256, aes_gcm_enc_256_sse, aes_gcm_enc_256_sse, aes_gcm_enc_256_avx_gen2, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_vaes_avx512 + +mbin_interface aes_gcm_enc_256_update +mbin_dispatch_init7 aes_gcm_enc_256_update, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_avx_gen2, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_vaes_avx512 + +mbin_interface aes_gcm_enc_256_finalize +mbin_dispatch_init7 aes_gcm_enc_256_finalize, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_avx_gen2, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_vaes_avx512 + +mbin_interface aes_gcm_dec_256 +mbin_dispatch_init7 aes_gcm_dec_256, aes_gcm_dec_256_sse, aes_gcm_dec_256_sse, aes_gcm_dec_256_avx_gen2, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_vaes_avx512 + +mbin_interface aes_gcm_dec_256_update +mbin_dispatch_init7 aes_gcm_dec_256_update, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_avx_gen2, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_vaes_avx512 + +mbin_interface aes_gcm_dec_256_finalize +mbin_dispatch_init7 aes_gcm_dec_256_finalize, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_avx_gen2, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_vaes_avx512 + +mbin_interface aes_gcm_precomp_256 +mbin_dispatch_init7 aes_gcm_precomp_256, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_avx_gen2, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_vaes_avx512 + + +;;; func core, ver, snum +slversion aes_gcm_enc_128, 00, 00, 02c0 +slversion aes_gcm_dec_128, 00, 00, 02c1 +slversion 
aes_gcm_init_128, 00, 00, 02c2 +slversion aes_gcm_enc_128_update, 00, 00, 02c3 +slversion aes_gcm_dec_128_update, 00, 00, 02c4 +slversion aes_gcm_enc_128_finalize, 00, 00, 02c5 +slversion aes_gcm_dec_128_finalize, 00, 00, 02c6 +slversion aes_gcm_enc_256, 00, 00, 02d0 +slversion aes_gcm_dec_256, 00, 00, 02d1 +slversion aes_gcm_init_256, 00, 00, 02d2 +slversion aes_gcm_enc_256_update, 00, 00, 02d3 +slversion aes_gcm_dec_256_update, 00, 00, 02d4 +slversion aes_gcm_enc_256_finalize, 00, 00, 02d5 +slversion aes_gcm_dec_256_finalize, 00, 00, 02d6 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm new file mode 100644 index 000000000..4c5083173 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm @@ -0,0 +1,118 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_gcm_enc_128_sse_nt +extern aes_gcm_enc_128_avx_gen4_nt +extern aes_gcm_enc_128_avx_gen2_nt +extern aes_gcm_enc_128_update_sse_nt +extern aes_gcm_enc_128_update_avx_gen4_nt +extern aes_gcm_enc_128_update_avx_gen2_nt + +extern aes_gcm_dec_128_sse_nt +extern aes_gcm_dec_128_avx_gen4_nt +extern aes_gcm_dec_128_avx_gen2_nt +extern aes_gcm_dec_128_update_sse_nt +extern aes_gcm_dec_128_update_avx_gen4_nt +extern aes_gcm_dec_128_update_avx_gen2_nt + +extern aes_gcm_enc_256_sse_nt +extern aes_gcm_enc_256_avx_gen4_nt +extern aes_gcm_enc_256_avx_gen2_nt +extern aes_gcm_enc_256_update_sse_nt +extern aes_gcm_enc_256_update_avx_gen4_nt +extern aes_gcm_enc_256_update_avx_gen2_nt + +extern aes_gcm_dec_256_sse_nt +extern aes_gcm_dec_256_avx_gen4_nt +extern aes_gcm_dec_256_avx_gen2_nt +extern aes_gcm_dec_256_update_sse_nt +extern aes_gcm_dec_256_update_avx_gen4_nt +extern aes_gcm_dec_256_update_avx_gen2_nt + +%if (AS_FEATURE_LEVEL) >= 10 +extern aes_gcm_enc_128_update_vaes_avx512_nt +extern aes_gcm_dec_128_update_vaes_avx512_nt +extern aes_gcm_enc_128_vaes_avx512_nt +extern aes_gcm_dec_128_vaes_avx512_nt + +extern aes_gcm_enc_256_update_vaes_avx512_nt +extern aes_gcm_dec_256_update_vaes_avx512_nt +extern aes_gcm_enc_256_vaes_avx512_nt +extern aes_gcm_dec_256_vaes_avx512_nt +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate aes_gcm NT interfaces enc, enc_update, dec, dec_update +;;;; +mbin_interface aes_gcm_enc_128_nt +mbin_dispatch_init7 aes_gcm_enc_128_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_avx_gen2_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_vaes_avx512_nt + +mbin_interface aes_gcm_enc_128_update_nt +mbin_dispatch_init7 aes_gcm_enc_128_update_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_avx_gen2_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_vaes_avx512_nt + +mbin_interface aes_gcm_dec_128_nt +mbin_dispatch_init7 aes_gcm_dec_128_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_avx_gen2_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_vaes_avx512_nt + +mbin_interface aes_gcm_dec_128_update_nt +mbin_dispatch_init7 aes_gcm_dec_128_update_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_avx_gen2_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_vaes_avx512_nt + +;;;; +; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp +;;;; +mbin_interface aes_gcm_enc_256_nt +mbin_dispatch_init7 aes_gcm_enc_256_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_avx_gen2_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_vaes_avx512_nt + +mbin_interface aes_gcm_enc_256_update_nt +mbin_dispatch_init7 aes_gcm_enc_256_update_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_avx_gen2_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_vaes_avx512_nt + +mbin_interface aes_gcm_dec_256_nt +mbin_dispatch_init7 aes_gcm_dec_256_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_avx_gen2_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_avx_gen4_nt, 
aes_gcm_dec_256_vaes_avx512_nt + +mbin_interface aes_gcm_dec_256_update_nt +mbin_dispatch_init7 aes_gcm_dec_256_update_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_avx_gen2_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_vaes_avx512_nt + + +;;; func core, ver, snum +slversion aes_gcm_enc_128_nt, 00, 00, 02e1 +slversion aes_gcm_dec_128_nt, 00, 00, 02e2 +slversion aes_gcm_enc_128_update_nt, 00, 00, 02e3 +slversion aes_gcm_dec_128_update_nt, 00, 00, 02e4 +slversion aes_gcm_enc_256_nt, 00, 00, 02e5 +slversion aes_gcm_dec_256_nt, 00, 00, 02e6 +slversion aes_gcm_enc_256_update_nt, 00, 00, 02e7 +slversion aes_gcm_dec_256_update_nt, 00, 00, 02e8 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c new file mode 100644 index 000000000..529d36b31 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c @@ -0,0 +1,2038 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include // for memcmp +#include +#include +#include "gcm_vectors.h" +#include "ossl_helper.h" +#include "types.h" + +//#define GCM_VECTORS_VERBOSE +//#define GCM_VECTORS_EXTRA_VERBOSE +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 200 +#endif +#ifndef TEST_LEN +# define TEST_LEN 32*1024 +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif + +// NT versions require 64B alignment +# define NT_ALIGNMENT (64) +# define ALIGNMENT_MASK (~(NT_ALIGNMENT - 1)) +# define OFFSET_BASE_VALUE (NT_ALIGNMENT) +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (1) +#endif + +void dump_table(char *title, uint8_t * table, uint8_t count) +{ + int i; + char const *space = " "; + + printf("%s%s => {\n", space, title); + for (i = 0; i < count; i++) { + if (0 == (i & 15)) + printf("%s%s", space, space); + printf("%2x, ", table[i]); + if (15 == (i & 15)) + printf("\n"); + + } + printf("%s}\n", space); +} + +void dump_gcm_data(struct gcm_key_data *gkey) +{ +#ifdef GCM_VECTORS_EXTRA_VERBOSE + printf("gcm_data {\n"); + dump_table("expanded_keys", gkey->expanded_keys, (16 * 11)); + dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16); + dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16); + dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16); + dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16); + dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16); + dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16); + dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16); + dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16); + dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16); + dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16); + dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16); + dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16); + dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16); + dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16); + dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16); + dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16); + printf("}\n"); +#endif //GCM_VECTORS_VERBOSE +} + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + int ret; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 
64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128_nt(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i, ret; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + 
fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, (i - last_break)); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (rand() % (test_len / 64) == 0) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, 
vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + free(rand_data); + + return OK; +} + +int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int length, int start, int breaks) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = length; + uint8_t *rand_data = NULL; + int ret; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + ret = posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, 
vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = length; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(rand_data); + if (vector->Plen != 0) { + free(pt_test); + free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = 1; + uint8_t *rand_data = NULL; + uint64_t length; + int ret; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) 
{ + ret = posix_memalign((void **)&stream, 64, PAGE_LEN); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break, + i - last_break); + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + ret = posix_memalign((void **)&stream, 64, PAGE_LEN); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break, + i - last_break); + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + free(rand_data); + + return OK; +} + +int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + 
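+ // (Note on the buffers declared here: pt_test/ct_test keep copies of the plaintext and of the ISA-L ciphertext, while o_ct_test/o_T_test receive the OpenSSL reference ciphertext and tag used for the cross-checks below.)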
uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + int ret; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_256_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256_nt(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i, ret; + uint8_t *rand_data = 
NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); + } + + aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + + } + aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + + aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + 
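+ // (Occasional SHA1 over throwaway random data between streamed update calls, presumably to perturb register and cache state and expose any GCM state not carried in gctx.)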
mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + + } + aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int test_gcm_strm_efence(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random efence test vectors with random stream:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector_efence(gkey, gctx, &test)) + return 1; + tag_len += 
4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 64); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = 0; // (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +int test_gcm_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = 
(rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if 
((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 64); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= 
offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +// +// place all data to end at a page boundary to check for read past the end +// +int test_gcm_efence(void) +{ + gcm_vector test; + int offset = 0; + gcm_key_size key_len; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + uint8_t *P = NULL, *C = NULL, *K, *IV, *A, *T; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + ret = posix_memalign((void **)&P, 64, PAGE_LEN); + ret |= posix_memalign((void **)&C, 64, PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + A = malloc(PAGE_LEN); + T = malloc(PAGE_LEN); + if ((0 != ret) || (NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) + || (NULL == A) || (NULL == T) || (NULL == gkey) || (NULL == gctx)) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + + test.Plen = PAGE_LEN / 2; + // place buffers to end at page boundary + test.IVlen = GCM_IV_DATA_LEN; + test.Alen = test.Plen; + test.Tlen = MAX_TAG_LEN; + + printf("AES GCM efence test vectors:"); + for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len; + key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) { + test.Klen = key_len; + for (offset = 0; MAX_UNALIGNED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.Plen = (PAGE_LEN / 2) - offset; + test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes + //Place data at end of page + test.P = P + PAGE_LEN - test.Plen; + test.C = C + PAGE_LEN - test.Plen; + test.K = K + PAGE_LEN - test.Klen; + test.IV = IV + PAGE_LEN - test.IVlen; + test.A = A + PAGE_LEN - test.Alen; + test.T = T + PAGE_LEN - test.Tlen; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + if (GCM_128_KEY_LEN == key_len) { + if (0 != check_vector(gkey, gctx, &test)) + return 1; + } else { + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + } + } + } + free(gkey); + free(gctx); + free(P); + free(C); + free(K); + free(IV); + free(A); + free(T); + + printf("\n"); + return 0; +} + +int test_gcm128_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int ret; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-128:\n"); +#endif + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 64, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 64, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + 
return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + 
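+ // (A -1 return from the OpenSSL helper indicates the tag failed to authenticate; that failure is reported and folded into OK below.)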
if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int ret; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-256:\n"); +#endif + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 64, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 64, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(ct_test, vector->C, vector->Tlen, "OpenSSL vs KA - cypher text (C)"); + OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)"); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, 
T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm_std_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors:\n"); + for (vect = 0; vect < vectors_cnt; vect++) { +#ifdef GCM_VECTORS_VERBOSE + printf + ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + + if (BITS_128 == gcm_vectors[vect].Klen) { + OK |= test_gcm128_std_vectors(&gcm_vectors[vect]); + } else { + OK |= test_gcm256_std_vectors(&gcm_vectors[vect]); + } + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +// The length of the data is set to length. The first stream is from 0 to start. After +// that the data is broken into breaks chunks of equal size (except possibly the last +// one due to divisibility). 
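+// Illustrative example (values chosen here for clarity, not taken from main()): with
+// length=1024, start=256 and breaks=4, the first update would cover bytes 0..255,
+// followed by updates of (1024-256)/4 = 192 bytes each, with any remainder folded
+// into the final update call before finalize.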
+int test_gcm_strm_combinations2(int length, int start, int breaks) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors of length %d and stream with %d breaks:", length, + breaks + 1); + for (t = 0; RANDOMS > t; t++) { + int Plen = length; + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); + errors += test_gcm256_combinations(); + errors += test_gcm_combinations(); + errors += test_gcm_efence(); + errors += test_gcm256_strm_combinations(TEST_LEN); + errors += test_gcm_strm_combinations(TEST_LEN); + errors += test_gcm256_strm_combinations(1024); + errors += test_gcm_strm_combinations(1024); + errors += test_gcm_strm_efence(); + errors += test_gcm_strm_combinations2(1024, 0, 1024); + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c new file mode 100644 index 000000000..19c0cc447 --- 
/dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c @@ -0,0 +1,322 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include // for memcmp +#include +#include "gcm_vectors.h" +#include "types.h" + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int test_gcm128_std_vectors_nt(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int ret; + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 32, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 32, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // 
This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, vector->Plen); + + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors_nt(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int ret; + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 32, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 32, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if 
(T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memset(pt_test, 0, vector->Plen); + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm_std_vectors_nt(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors NT:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector NT %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_vectors_nt(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_vectors_nt(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + 
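+ // (The seed is printed so a failing run can be reproduced by passing the same value as the first command-line argument.)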
errors += test_gcm_std_vectors_nt(); + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c new file mode 100644 index 000000000..a9e9c5914 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c @@ -0,0 +1,272 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include +#include +#include "ossl_helper.h" +#include "gcm_vectors.h" + +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
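+// (Cold-cache variant: TEST_LEN becomes 2 x 32 MB = 64 MB with only 50 loops, versus 8 KB and 400000 loops for the cached build, so each pass streams from memory rather than from cache.)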
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define AAD_LENGTH 16 +#define TEST_MEM TEST_LEN + +static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext, + *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD; +static uint8_t key128[GCM_128_KEY_LEN]; +static uint8_t key256[GCM_256_KEY_LEN]; +uint8_t iv_len = 0; + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + unsigned int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name) +{ + int mismatch; + int OK = 1; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 0; + printf(" v[%d] expected results don't match %s \t\t", vect, data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +void aes_gcm_perf(void) +{ + struct gcm_key_data gkey, gkey256; + struct gcm_context_data gctx; + int i; + + printf + ("AES GCM performance parameters plain text length:%d; IV length:%d; ADD length:%d \n", + TEST_LEN, GCM_IV_LEN, AAD_LENGTH); + + mk_rand_data(key128, sizeof(key128)); + mk_rand_data(key256, sizeof(key256)); + + // This is only required once for a given key + aes_gcm_pre_128(key128, &gkey); + aes_gcm_pre_256(key256, &gkey256); + + // Preload code cache + aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH, + gcm_tag, MAX_TAG_LEN); + openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN, + plaintext, TEST_LEN, ossl_cyphertext); + check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0, + "ISA-L vs OpenSSL 128 key cypher text (C)"); + check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)"); + aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH, + gcm_tag, MAX_TAG_LEN); + openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN, + plaintext, TEST_LEN, ossl_cyphertext); + check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0, + "ISA-L vs OpenSSL 256 cypher text (C)"); + check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)"); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, + AAD_LENGTH, gcm_tag, MAX_TAG_LEN); + } + + perf_stop(&stop); + printf(" aes_gcm_enc" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, + ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN, + cyphertext); + } + + perf_stop(&stop); + printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_gcm_dec_128(&gkey, &gctx, plaintext, cyphertext, TEST_LEN, IV, + AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN); + check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)"); + } + + perf_stop(&stop); + printf(" aes_gcm_dec" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + 
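+			// Time the equivalent OpenSSL AES-128-GCM decrypt on the same buffers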
openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH, + ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN, + plaintext); + } + + perf_stop(&stop); + printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + printf("\n"); + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV, + AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN); + } + + perf_stop(&stop); + printf(" aes_gcm256_enc" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, + ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN, + cyphertext); + } + + perf_stop(&stop); + printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_gcm_dec_256(&gkey256, &gctx, plaintext, cyphertext, TEST_LEN, IV, + AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN); + check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, + "ISA-L check of 256 tag (T)"); + } + + perf_stop(&stop); + printf(" aes_gcm256_dec" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH, + ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN, + plaintext); + } + + perf_stop(&stop); + printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t"); + perf_print(stop, start, (long long)TEST_LEN * i); + } +} + +int main(void) +{ + uint8_t const IVend[] = GCM_IV_END_MARK; + uint32_t OK = 1; + + plaintext = malloc(TEST_LEN); + gcm_plaintext = malloc(TEST_LEN); + cyphertext = malloc(TEST_LEN); + ossl_plaintext = malloc(TEST_LEN + 16); + ossl_cyphertext = malloc(TEST_LEN); + gcm_tag = malloc(MAX_TAG_LEN); + ossl_tag = malloc(MAX_TAG_LEN); + AAD = malloc(AAD_LENGTH); + IV = malloc(GCM_IV_LEN); + if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext) + || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext) + || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) { + printf("malloc of testsize:0x%x failed\n", TEST_LEN); + return -1; + } + + mk_rand_data(plaintext, TEST_LEN); + mk_rand_data(AAD, AAD_LENGTH); + mk_rand_data(IV, GCM_IV_LEN); + memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend)); + iv_len = GCM_IV_LEN - sizeof(IVend); //end marker not part of IV length + + aes_gcm_perf(); + printf("AES gcm ISA-L vs OpenSSL performance\n"); + + return !OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c new file mode 100644 index 000000000..ee064ef6c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c @@ -0,0 +1,61 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include + +void aes_keyexp_128_enc(const void *, uint8_t *); +void aes_gcm_precomp_128(struct gcm_key_data *key_data); +void aes_gcm_precomp_256(struct gcm_key_data *key_data); + +void aes_gcm_pre_128(const void *key, struct gcm_key_data *key_data) +{ + aes_keyexp_128_enc(key, key_data->expanded_keys); + aes_gcm_precomp_128(key_data); +} + +void aes_gcm_pre_256(const void *key, struct gcm_key_data *key_data) +{ + uint8_t tmp_exp_key[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; + aes_keyexp_256((const uint8_t *)key, (uint8_t *) key_data->expanded_keys, tmp_exp_key); + aes_gcm_precomp_256(key_data); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver aes_gcm_pre_128_slver_000002c7; +struct slver aes_gcm_pre_128_slver = { 0x02c7, 0x00, 0x00 }; + +struct slver aes_gcm_pre_256_slver_000002d7; +struct slver aes_gcm_pre_256_slver = { 0x02d7, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c new file mode 100644 index 000000000..4b7ca9736 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c @@ -0,0 +1,78 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "aes_gcm.h" + +#define TXT_SIZE 8 +#define AAD_SIZE 32 +#define TAG_SIZE 16 /* Valid values are 16, 12, or 8 */ +#define KEY_SIZE GCM_256_KEY_LEN +#define IV_SIZE GCM_IV_DATA_LEN + +void mprint(const char *msg, uint8_t * buf, int len) +{ + int i; + printf("%s", msg); + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 32 == 0) + printf("\n"); + } + printf("\n"); +} + +int main(void) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + uint8_t ct[TXT_SIZE], pt[TXT_SIZE], pt2[TXT_SIZE]; // Cipher text and plain text + uint8_t iv[IV_SIZE], aad[AAD_SIZE], key[KEY_SIZE]; // Key and authentication data + uint8_t tag1[TAG_SIZE], tag2[TAG_SIZE]; // Authentication tags for encode and decode + + printf("gcm example:\n"); + memset(key, 0, KEY_SIZE); + memset(pt, 0, TXT_SIZE); + memset(iv, 0, IV_SIZE); + memset(aad, 0, AAD_SIZE); + + aes_gcm_pre_256(key, &gkey); + aes_gcm_enc_256(&gkey, &gctx, ct, pt, TXT_SIZE, iv, aad, AAD_SIZE, tag1, TAG_SIZE); + aes_gcm_dec_256(&gkey, &gctx, pt2, ct, TXT_SIZE, iv, aad, AAD_SIZE, tag2, TAG_SIZE); + + mprint(" input text: ", pt, TXT_SIZE); + mprint(" cipher text: ", ct, TXT_SIZE); + mprint(" decode text: ", pt2, TXT_SIZE); + mprint(" ath tag1 (enc): ", tag1, TAG_SIZE); + mprint(" ath tag2 (dec): ", tag2, TAG_SIZE); + + return memcmp(tag1, tag2, TAG_SIZE); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm new file mode 100644 index 000000000..e35860496 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm @@ -0,0 +1,2171 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_sse.asm!" 
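+; exactly one of GCM128_MODE, GCM192_MODE or GCM256_MODE must be defined before including this file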
+%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 13 +%endif + + +default rel +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqa %%T1, %%GH + pshufd %%T2, %%GH, 01001110b + pshufd %%T3, %%HK, 01001110b + pxor %%T2, %%GH ; %%T2 = (a1+a0) + pxor %%T3, %%HK ; %%T3 = (b1+b0) + + pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T2, %%GH + pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%GH, %%T3 + pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK + + + ;first phase of the reduction + movdqa %%T2, %%GH + movdqa %%T3, %%GH + movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%GH + movdqa %%T4,%%GH + + psrld %%T2,1 ; 
packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T5 + pxor %%GH, %%T2 + pxor %%GH, %%T1 ; the result is in %%T1 + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + movdqa %%T4, %%HK + pshufd %%T1, %%HK, 01001110b + pxor %%T1, %%HK + movdqu [%%GDATA + HashKey_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly + movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly + movdqu [%%GDATA + HashKey_3], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_3_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly + movdqu [%%GDATA + HashKey_4], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly + movdqu [%%GDATA + HashKey_5], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_5_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly + movdqu [%%GDATA + HashKey_6], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly + movdqu [%%GDATA + HashKey_7], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly + movdqu [%%GDATA + HashKey_8], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_8_k], %%T1 + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). 
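+; Note: the LENGTH input bytes end up in the low LENGTH bytes of OUTPUT, zero-padded to 16 bytes.
+; At most one 8-byte load from INPUT is used; any remaining bytes are fetched individually,
+; working backwards from INPUT+LENGTH, so nothing past the buffer is touched.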
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + pxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + pinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + pinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + pxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + movdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. +; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC). 
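+; Note: PBlockLen in GDATA_CTX holds how many bytes of a 16-byte block are still pending from the
+; previous update call; the macro does nothing when it is zero.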
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + mov r13, [%%GDATA_CTX + PBlockLen] + +%%_data_read: ;Finished reading in data + + + movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key + movdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + movdqa xmm3, xmm1 + pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pand xmm3, xmm1 + pshufb xmm3, [SHUF_MASK] + pshufb xmm3, xmm2 + pxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pshufb xmm9, [SHUF_MASK] + pshufb xmm9, xmm2 + pxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + pshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov 
r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + + ; start AES for %%num_initial_blocks blocks + movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + paddd %%CTR, [ONE] ; INCR Y0 + movdqa reg(i), %%CTR + pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +movdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + pxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192) +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + pxor reg(i), %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + movdqa reg(i), %%T1 + %endif + pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + pxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + movdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + paddd 
%%CTR, [ONE] ; INCR Y0 + movdqa %%XMM1, %%CTR + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM2, %%CTR + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM3, %%CTR + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM4, %%CTR + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM5, %%CTR + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM6, %%CTR + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM7, %%CTR + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM8, %%CTR + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + movdqu %%T_key, [%%GDATA_KEY+16*0] + pxor %%XMM1, %%T_key + pxor %%XMM2, %%T_key + pxor %%XMM3, %%T_key + pxor %%XMM4, %%T_key + pxor %%XMM5, %%T_key + pxor %%XMM6, %%T_key + pxor %%XMM7, %%T_key + pxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS ; do early (13) rounds (11 for GCM192) + movdqu %%T_key, [%%GDATA_KEY+16*i] + aesenc %%XMM1, %%T_key + aesenc %%XMM2, %%T_key + aesenc %%XMM3, %%T_key + aesenc %%XMM4, %%T_key + aesenc %%XMM5, %%T_key + aesenc %%XMM6, %%T_key + aesenc %%XMM7, %%T_key + aesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round + aesenclast %%XMM1, %%T_key + aesenclast %%XMM2, %%T_key + aesenclast %%XMM3, %%T_key + aesenclast %%XMM4, %%T_key + aesenclast %%XMM5, %%T_key + aesenclast %%XMM6, %%T_key + aesenclast %%XMM7, %%T_key + aesenclast %%XMM8, %%T_key + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + pxor %%XMM1, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM1, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + pxor %%XMM2, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM2, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + pxor %%XMM3, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM3, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + pxor %%XMM4, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM4, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + pxor %%XMM5, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM5, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + pxor %%XMM6, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM6, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + pxor %%XMM7, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM7, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + pxor %%XMM8, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, 
[SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + movdqa %%T7, %%XMM1 + movdqu [rsp + TMP2], %%XMM2 + movdqu [rsp + TMP3], %%XMM3 + movdqu [rsp + TMP4], %%XMM4 + movdqu [rsp + TMP5], %%XMM5 + movdqu [rsp + TMP6], %%XMM6 + movdqu [rsp + TMP7], %%XMM7 + movdqu [rsp + TMP8], %%XMM8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + + movdqa %%T4, %%T7 + pshufd %%T6, %%T7, 01001110b + pxor %%T6, %%T7 + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + %else + paddd %%CTR, [ONEf] ; INCR CNT + %endif + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_8_k] + pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + movdqa %%XMM1, %%CTR + + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM8, %%CTR + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + %else + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM8, %%CTR + %endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu %%T1, [%%GDATA + 16*0] + pxor %%XMM1, %%T1 + pxor %%XMM2, %%T1 + pxor %%XMM3, %%T1 + pxor %%XMM4, %%T1 + pxor %%XMM5, %%T1 + pxor %%XMM6, %%T1 + pxor %%XMM7, %%T1 + pxor %%XMM8, %%T1 + + ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied + 
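+	;; The eight ciphertext blocks saved above (rsp+TMP2..TMP8 plus %%T7) are GHASHed below,
+	;; interleaved with the AES rounds of the next eight counter blocks.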
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP2] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*1] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + movdqu %%T1, [%%GDATA + 16*2] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqu %%T1, [rsp + TMP3] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*3] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP4] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*4] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*5] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP5] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + + movdqu %%T1, [%%GDATA + 16*6] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + movdqu %%T1, [rsp + TMP6] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor 
%%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*7] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP7] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*8] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP8] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T7, %%T3 + pxor %%T4, %%T1 + + movdqu %%T1, [%%GDATA + 16*9] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + +%ifdef GCM128_MODE + movdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*12] ; finish last key round +%endif +%ifdef GCM256_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*12] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*13] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*14] ; finish last key round +%endif + +%assign i 0 +%assign j 1 +%rep 8 + XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + +%ifidn %%ENC_DEC, DEC + movdqa %%T3, %%T1 +%endif + + pxor %%T1, %%T5 + aesenclast reg(j), %%T1 ; XMM1:XMM8 + XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer + +%ifidn %%ENC_DEC, DEC + movdqa reg(j), 
%%T3 +%endif +%assign i (i+1) +%assign j (j+1) +%endrep + + + + + pxor %%T2, %%T6 + pxor %%T2, %%T4 + pxor %%T2, %%T7 + + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T3 + pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 + + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T1, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T1 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T1,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T1,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T1 + + pxor %%T2, %%T5 + pxor %%T7, %%T2 + pxor %%T7, %%T4 ; the result is in %%T4 + + + pxor %%XMM1, %%T7 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. 
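+; GHASH_LAST_8 folds the final eight ciphertext blocks (XMM1-XMM8) into the hash; the accumulated result is left in %%T6.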
+%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ; Karatsuba Method + movdqa %%T6, %%XMM1 + pshufd %%T2, %%XMM1, 01001110b + pxor %%T2, %%XMM1 + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1 + + pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_8_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + movdqa %%T7, %%XMM1 + movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM2 + pshufd %%T2, %%XMM2, 01001110b + pxor %%T2, %%XMM2 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM2 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM3 + pshufd %%T2, %%XMM3, 01001110b + pxor %%T2, %%XMM3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM3 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM4 + pshufd %%T2, %%XMM4, 01001110b + pxor %%T2, %%XMM4 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM4 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM5 + pshufd %%T2, %%XMM5, 01001110b + pxor %%T2, %%XMM5 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM5 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM6 + pshufd %%T2, %%XMM6, 01001110b + pxor %%T2, %%XMM6 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM6 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM7 + pshufd %%T2, %%XMM7, 01001110b + pxor %%T2, %%XMM7 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM7 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM8 + pshufd %%T2, %%XMM8, 01001110b + pxor %%T2, %%XMM8 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_k] + 
pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM8 + pxor %%T2, %%XMM1 + pxor %%T2, %%T6 + pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm + + + movdqa %%T4, %%T2 + pslldq %%T4, 8 ; shift-L %%T4 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T4 + pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T1, %%T2 + psrldq %%T1, 4 ; shift-R %%T1 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T4,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T1 + pxor %%T7, %%T2 + pxor %%T6, %%T7 ; the result is in %%T6 + +%endmacro + +; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 3 +%define %%GDATA %1 +%define %%ST %2 +%define %%T1 %3 + movdqu %%T1, [%%GDATA+16*0] + pxor %%ST, %%T1 +%assign i 1 +%rep NROUNDS + movdqu %%T1, [%%GDATA+16*i] + aesenc %%ST, %%T1 +%assign i (i+1) +%endrep + movdqu %%T1, [%%GDATA+16*i] + aesenclast %%ST, %%T1 +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + push rsi + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 + + mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5] +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for 
encoding/decoding. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +; Additional Authentication data (A_IN), Additional Data length (A_LEN). +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13 and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 +%define %%SUBHASH xmm1 + + + movdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + pxor xmm2, xmm3 + mov r10, %%A_LEN + + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + pinsrq xmm2, [r10], 0 + pinsrd xmm2, [r10+8], 2 + movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + pshufb xmm2, [SHUF_MASK] + + movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data +; struct has been initialized by GCM_INIT. +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' + + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed + movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + movdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext + sub r13, %%DATA_OFFSET + mov r10, r13 ;save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + movd r15d, xmm9 + and r15d, 255 + pshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg 
%%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + pshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + pshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + movdqu [%%GDATA_CTX + AadHash], xmm14 + movdqu [%%GDATA_CTX + CurCount], xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13 + ; handle the last <16 Byte block seperately + + paddd xmm9, [ONE] ; INCR CNT to get Yn + movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9 + pshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn) + movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: + %ifidn %%ENC_DEC, DEC + movdqa xmm2, xmm1 + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pand xmm2, xmm1 + pshufb xmm2, [SHUF_MASK] + pxor xmm14, xmm2 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + %else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pshufb xmm9, [SHUF_MASK] + pxor xmm14, xmm9 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + %endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
+; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes) + movdqu xmm14, [%%GDATA_CTX + AadHash] + movdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + movdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + movd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + movq xmm1, %%PLAIN_CYPH_LEN + pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + pxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + pxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0) + + pxor xmm9, xmm14 + + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + movq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + movq rax, xmm9 + mov [r10], rax + psrldq xmm9, 8 + movd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + movdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ;GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + pxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey + + pshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + movdqa xmm2, xmm6 + psllq xmm6, 1 + psrlq xmm2, 63 + movdqa xmm1, xmm2 + pslldq xmm2, 8 + psrldq xmm1, 8 + por xmm6, xmm2 + ;reduction + pshufd xmm2, xmm1, 00100100b + pcmpeqd xmm2, [TWOONE] + pand xmm2, [POLY] + pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 +ret +%endif ; _nt + + 
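+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Typical call sequence from C for the entry points defined below (a minimal sketch
+; for the 128-bit variant, mirroring how the random test file added later in this
+; patch drives the API; buffer setup and error handling omitted):
+;
+;   struct gcm_key_data gkey;
+;   struct gcm_context_data gctx;
+;
+;   aes_gcm_pre_128(key, &gkey);                          /* expand key, precompute HashKey powers */
+;   aes_gcm_init_128(&gkey, &gctx, iv, aad, aad_len);     /* hash the AAD, set up the counter      */
+;   aes_gcm_enc_128_update(&gkey, &gctx, out, in, len);   /* may be called repeatedly on a stream  */
+;   aes_gcm_enc_128_finalize(&gkey, &gctx, tag, tag_len); /* tag_len of 8, 12 or 16 bytes          */
+;
+; or, for a single buffer, the one-shot aes_gcm_enc_128(&gkey, &gctx, out, in, len,
+; iv, aad, aad_len, tag, tag_len); the dec_ variants take the same arguments.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;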
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse ( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + push arg5 + sub rsp, 1*16 + movdqu [rsp + 0*16],xmm6 + mov arg5, [rsp + 1*16 + 8*3 + 8*5] +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + add rsp, 1*16 + pop arg5 +%endif + pop r13 + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_256_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn 
FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + + FUNC_RESTORE + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c new file mode 100644 index 000000000..b0a6221d5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c @@ -0,0 +1,1940 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include // for memcmp +#include +#include +#include "gcm_vectors.h" +#include "ossl_helper.h" +#include "types.h" + +//#define GCM_VECTORS_VERBOSE +//#define GCM_VECTORS_EXTRA_VERBOSE +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 200 +#endif +#ifndef TEST_LEN +# define TEST_LEN 32*1024 +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif + +#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST) +# define ALIGNMENT_MASK (~15) +# define OFFSET_BASE_VALUE 16 +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (1) +#endif +#else +# define ALIGNMENT_MASK (~0) +# define OFFSET_BASE_VALUE 1 +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (16) +#endif +#endif + +void dump_table(char *title, uint8_t * table, uint8_t count) +{ + int i; + char const *space = " "; + + printf("%s%s => {\n", space, title); + for (i = 0; i < count; i++) { + if (0 == (i & 15)) + printf("%s%s", space, space); + printf("%2x, ", table[i]); + if (15 == (i & 15)) + printf("\n"); + + } + printf("%s}\n", space); +} + +void dump_gcm_data(struct gcm_key_data *gkey) +{ +#ifdef GCM_VECTORS_EXTRA_VERBOSE + printf("gcm_data {\n"); + dump_table("expanded_keys", gkey->expanded_keys, (16 * 11)); + dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16); + dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16); + dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16); + dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16); + dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16); + dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16); + dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16); + dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16); + dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16); + dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16); + dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16); + dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16); + dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16); + dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16); + dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16); + dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16); + printf("}\n"); +#endif //GCM_VECTORS_VERBOSE +} + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector) +{ + uint8_t *pt_test = 
NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", 
+ (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (rand() % (test_len / 64) == 0) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, 
o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + free(rand_data); + + return OK; +} + +int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int length, int start, int breaks) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = length; + uint8_t *rand_data = NULL; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = length; + aes_gcm_init_128(gkey, 
gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(rand_data); + + return OK; +} + +int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = 1; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + stream = malloc(PAGE_LEN); + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break, + i - last_break); + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + } + if (rand() % 1024 != 0) + i++; + + } + 
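+	/* Stream whatever is left after the last random break point as one final
+	 * update, then finalize to produce the tag; the one-shot OpenSSL encryption
+	 * below supplies the reference ciphertext and tag for the comparisons. */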
aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + stream = malloc(PAGE_LEN); + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break, + i - last_break); + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + free(rand_data); + + return OK; +} + +int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + 
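+	/* As in check_vector(), work on a local copy of the IV so the ISA-L and
+	 * OpenSSL runs start from the same nonce; this variant exercises the
+	 * AES-256 entry points (aes_gcm_pre_256, aes_gcm_enc_256, aes_gcm_dec_256)
+	 * against openssl_aes_256_gcm_enc/dec. */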
memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_256(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, 
"Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + + aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - last_break); + } + + aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int test_gcm_strm_efence(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data 
*gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random efence test vectors with random stream:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector_efence(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 16); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = 0; // (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = 
malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +int test_gcm_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey 
= NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 16); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = 
malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +// +// place all data to end at a page boundary to check for read past the end +// +int test_gcm_efence(void) +{ + gcm_vector test; + int offset = 0; + gcm_key_size key_len; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + uint8_t *P, *C, *K, *IV, *A, *T; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + P = malloc(PAGE_LEN); + C = malloc(PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + A = malloc(PAGE_LEN); + T = malloc(PAGE_LEN); + if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A) + || (NULL == T) || (NULL == gkey) || (NULL == gctx)) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + + test.Plen = PAGE_LEN / 2; + // place buffers to end at page boundary + test.IVlen = GCM_IV_DATA_LEN; + test.Alen = test.Plen; + test.Tlen = MAX_TAG_LEN; + + printf("AES GCM efence test vectors:"); + for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len; + key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) { + test.Klen = key_len; + for (offset = 0; MAX_UNALIGNED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.Plen = (PAGE_LEN / 2) - offset; + test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes + //Place data at end of page + test.P = P + PAGE_LEN - test.Plen; + test.C = C + PAGE_LEN - test.Plen; + test.K = K + PAGE_LEN - test.Klen; + test.IV = IV + PAGE_LEN - test.IVlen; + test.A = A + PAGE_LEN - test.Alen; + test.T = T + PAGE_LEN - test.Tlen; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + if (GCM_128_KEY_LEN == key_len) { + if (0 != check_vector(gkey, gctx, &test)) + return 1; + } else { + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + } + } + } + free(gkey); + free(gctx); + free(P); + free(C); + free(K); + free(IV); + free(A); + free(T); + + printf("\n"); + return 0; +} + +int test_gcm128_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; 
+ int result; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-128:\n"); +#endif + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + if (ct_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->Plen); + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + memset(pt_test, 0, vector->Plen); + 
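+	// Decrypt the OpenSSL-produced ciphertext with ISA-L below; the recovered plain text must match the vector and the ISA-L decrypt tag must match the tag OpenSSL generated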
aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-256:\n"); +#endif + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(ct_test, vector->C, vector->Tlen, "OpenSSL vs KA - cypher text (C)"); + OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)"); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, 
vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm_std_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors:\n"); + for (vect = 0; vect < vectors_cnt; vect++) { +#ifdef GCM_VECTORS_VERBOSE + printf + ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + + if (BITS_128 == gcm_vectors[vect].Klen) { + OK |= test_gcm128_std_vectors(&gcm_vectors[vect]); + } else { + OK |= test_gcm256_std_vectors(&gcm_vectors[vect]); + } + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +// The length of the data is set to length. The first stream is from 0 to start. 
After +// that the data is broken into breaks chunks of equal size (except possibly the last +// one due to divisibility). +int test_gcm_strm_combinations2(int length, int start, int breaks) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors of length %d and stream with %d breaks:", length, + breaks + 1); + for (t = 0; RANDOMS > t; t++) { + int Plen = length; + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); + errors += test_gcm256_combinations(); + errors += test_gcm_combinations(); + errors += test_gcm_efence(); + errors += test_gcm256_strm_combinations(TEST_LEN); + errors += test_gcm_strm_combinations(TEST_LEN); + errors += test_gcm256_strm_combinations(1024); + errors += test_gcm_strm_combinations(1024); + errors += test_gcm_strm_efence(); + errors += test_gcm_strm_combinations2(1024, 0, 1024); + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c new file mode 100644 index 000000000..54581d6b6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c @@ -0,0 +1,659 @@ 
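A side note on the chunking described in the test_gcm_strm_combinations2() comment above: the actual split is performed by check_strm_vector2(), which is defined earlier and not visible here. The sketch below is only a hypothetical illustration of that partitioning (print_stream_split and its exact layout are assumptions, not part of the patch).

    #include <stdio.h>

    /* Hypothetical helper: shows how `length` bytes would be fed to the stream
     * API as described above - one update of `start` bytes, then `breaks`
     * chunks of equal size, the last chunk absorbing any remainder. */
    static void print_stream_split(int length, int start, int breaks)
    {
            int remaining = length - start;
            int chunk = (breaks > 0) ? remaining / breaks : remaining;
            int off = start;
            int i;

            printf("update 0: bytes [0, %d)\n", start);
            for (i = 0; i < breaks && off < length; i++) {
                    int len = (i == breaks - 1) ? (length - off) : chunk;
                    printf("update %d: bytes [%d, %d)\n", i + 1, off, off + len);
                    off += len;
            }
    }

    int main(void)
    {
            print_stream_split(1024, 0, 8); /* e.g. 8 equal chunks after an empty first update */
            return 0;
    }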
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include // for memcmp +#include +#include "gcm_vectors.h" +#include "types.h" + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int test_gcm128_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, 
T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, vector->Plen); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text 
(C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +void aes_gcm_stream_enc_128(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_128(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in verctors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_enc_128_finalize(key_data, context, auth_tag, auth_tag_len); +} + +void aes_gcm_stream_dec_128(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_128(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + 
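+	// Any bytes left over after the scripted chunk sequence are decrypted in one final update; finalize then produces the authentication tag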
aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_dec_128_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) +int test_gcm128_std_stream_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + memset(gkey.expanded_keys, 0, sizeof(gkey.expanded_keys)); + aes_gcm_pre_128(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + + aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_stream_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_stream_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, 
vector->Plen); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +void aes_gcm_stream_enc_256(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_256(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_enc_256_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +void aes_gcm_stream_dec_256(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_256(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_dec_256_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +int test_gcm256_std_stream_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_stream_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, 
T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_stream_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} +#endif + +int test_gcm_std_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors new api:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector new api %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_vectors(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_vectors(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) +/** + * Stream API test with standard vectors + */ +int test_gcm_std_strm_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors stream api:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector stream api %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_stream_vectors(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_stream_vectors(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} +#endif +int main(int argc, char **argv) +{ + 
int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); +#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) + errors += test_gcm_std_strm_vectors(); +#endif + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm new file mode 100644 index 000000000..dac7c5912 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm @@ -0,0 +1,4296 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; Tomasz Kantecki +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation is explained in: +; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. 
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "clear_regs.asm" +%include "gcm_keys_vaes_avx512.asm" +%include "gcm_defines.asm" +%include "memcpy.asm" +%include "aes_common.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx512.asm!" 
+%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +;; Decide on AES-GCM key size to compile for +%ifdef GCM128_MODE +%define NROUNDS 9 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION +%endif + +%ifdef GCM192_MODE +%define NROUNDS 11 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION +%endif + +%ifdef GCM256_MODE +%define NROUNDS 13 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION +%endif + +%if (AS_FEATURE_LEVEL) >= 10 + +section .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Stack frame definition +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE (10*16) ; space for 10 XMM registers + %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment +%else + %define XMM_STORAGE 0 + %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment +%endif +%ifdef GCM_BIG_DATA +%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks +%else +%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks +%endif + +;;; sequence is (bottom-up): GP, XMM, local +%define STACK_GP_OFFSET 0 +%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE) +%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE) +%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE) + +;; for compatibility with stack argument definitions in gcm_defines.asm +%define STACK_OFFSET 0 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; =========================================================================== +;;; =========================================================================== +;;; Horizontal XOR - 4 x 128bits xored together +%macro VHPXORI4x128 2 +%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output +%define %%TMP %2 ; [clobbered] ZMM temporary register + vextracti64x4 YWORD(%%TMP), %%REG, 1 + vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP) + vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1 + vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) +%endmacro ; VHPXORI4x128 + +;;; =========================================================================== +;;; =========================================================================== +;;; Horizontal XOR - 2 x 128bits xored together +%macro VHPXORI2x128 2 +%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output +%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register + vextracti32x4 XWORD(%%TMP), %%REG, 1 + vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) +%endmacro ; VHPXORI2x128 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply - 1st step +%macro VCLMUL_STEP1 6-7 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [in] previous blocks 4 to 7 +%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TH %4 ; [out] high product +%define %%TM %5 ; [out] medium product +%define %%TL %6 ; [out] low product +%define %%HKEY %7 ; [in/optional] hash key for multiplication + +%if %0 == 6 + vmovdqu64 %%TMP, [%%KP + HashKey_4] +%else + vmovdqa64 %%TMP, %%HKEY +%endif + vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%T5 = a1*b1 + vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%T7 = a0*b0 + vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%T4 = a0*b1 + vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL] 
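+	;; %%TM now holds both middle schoolbook products (a1*b0 xor a0*b1);
+	;; VCLMUL_STEP2 later folds %%TM into the high/low halves before reduction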
+%endmacro ; VCLMUL_STEP1 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply - 2nd step +%macro VCLMUL_STEP2 9-11 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [out] ghash high 128 bits +%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out) +%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TH %7 ; [in] high product +%define %%TM %8 ; [in] medium product +%define %%TL %9 ; [in] low product +%define %%HKEY %10 ; [in/optional] hash key for multiplication +%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none) + +%if %0 == 9 + vmovdqu64 %%TMP0, [%%KP + HashKey_8] +%else + vmovdqa64 %%TMP0, %%HKEY +%endif + vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1 + vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1 + vpxorq %%TH, %%TH, %%TMP2 + vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0 + vpxorq %%TL, %%TL, %%TMP2 + vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0 + vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0 + + ;; finish multiplications + vpsrldq %%TMP2, %%TM, 8 + vpxorq %%HI, %%TH, %%TMP2 + vpslldq %%TMP2, %%TM, 8 + vpxorq %%LO, %%TL, %%TMP2 + + ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... ((X1+Y0)*H8] + ;; note: (X1+Y0) handled elsewhere +%if %0 < 11 + VHPXORI4x128 %%HI, %%TMP2 + VHPXORI4x128 %%LO, %%TMP1 +%else +%if %%HXOR == 4 + VHPXORI4x128 %%HI, %%TMP2 + VHPXORI4x128 %%LO, %%TMP1 +%elif %%HXOR == 2 + VHPXORI2x128 %%HI, %%TMP2 + VHPXORI2x128 %%LO, %%TMP1 +%endif ; HXOR + ;; for HXOR == 1 there is nothing to be done +%endif ; !(%0 < 11) + ;; HIx holds top 128 bits + ;; LOx holds low 128 bits + ;; - further reductions to follow +%endmacro ; VCLMUL_STEP2 + +;;; =========================================================================== +;;; =========================================================================== +;;; AVX512 reduction macro +%macro VCLMUL_REDUCE 6 +%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128) +%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial +%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce +%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce +%define %%TMP0 %5 ; [in] zmm/ymm/xmm: temporary register +%define %%TMP1 %6 ; [in] zmm/ymm/xmm: temporary register + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; first phase of the reduction + vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01 + vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs + vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; second phase of the reduction + vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00 + vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R + + vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10 + vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts + + vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endmacro + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply (1 to 
8 blocks) - 1st step +%macro VCLMUL_1_TO_8_STEP1 8 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7 +%define %%TMP1 %3 ; [clobbered] ZMM temporary +%define %%TMP2 %4 ; [clobbered] ZMM temporary +%define %%TH %5 ; [out] ZMM high product +%define %%TM %6 ; [out] ZMM medium product +%define %%TL %7 ; [out] ZMM low product +%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%HI, %%TMP1 + VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2 +%elif %%NBLOCKS == 6 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2) +%elif %%NBLOCKS == 5 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2) +%else + vpxorq %%TH, %%TH + vpxorq %%TM, %%TM + vpxorq %%TL, %%TL +%endif +%endmacro ; VCLMUL_1_TO_8_STEP1 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply (1 to 8 blocks) - 2nd step +%macro VCLMUL_1_TO_8_STEP2 10 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [out] ZMM ghash high 128bits +%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out) +%define %%TMP0 %4 ; [clobbered] ZMM temporary +%define %%TMP1 %5 ; [clobbered] ZMM temporary +%define %%TMP2 %6 ; [clobbered] ZMM temporary +%define %%TH %7 ; [in/clobbered] ZMM high sum +%define %%TM %8 ; [in/clobbered] ZMM medium sum +%define %%TL %9 ; [in/clobbered] ZMM low sum +%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_7] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 6 + vmovdqu64 %%TMP2, [%%KP + HashKey_6] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 5 + vmovdqu64 %%TMP2, [%%KP + HashKey_5] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 4 + vmovdqu64 %%TMP2, [%%KP + HashKey_4] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 3 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%LO, %%TMP1 + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 2 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \ + YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2 +%elif %%NBLOCKS == 1 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \ + XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1 +%else + vpxorq %%HI, %%HI + vpxorq %%LO, %%LO +%endif +%endmacro ; VCLMUL_1_TO_8_STEP2 + +;;; =========================================================================== +;;; =========================================================================== +;;; 
GHASH 1 to 16 blocks of cipher text +;;; - performs reduction at the end +;;; - can take intermediate GHASH sums as input +%macro GHASH_1_TO_16 20 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define %%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; [clobbered] temporary ZMM +%define %%GH %12 ; [in/cloberred] ghash sum (high) or "no_zmm" +%define %%GL %13 ; [in/cloberred] ghash sum (low) or "no_zmm" +%define %%GM %14 ; [in/cloberred] ghash sum (medium) or "no_zmm" +%define %%AAD_HASH_IN %15 ; [in] input hash value +%define %%CIPHER_IN0 %16 ; [in] ZMM with cipher text blocks 0-3 +%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7 +%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11 +%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15 +%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%HK %%T9 + +%assign hashk HashKey_ %+ %%NUM_BLOCKS +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + + vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN + +%assign first_result 1 + +%ifnidn %%GH, no_zmm +%ifnidn %%GM, no_zmm +%ifnidn %%GL, no_zmm + ;; GHASH sums passed in to be updated and + ;; reduced at the end + vmovdqa64 %%T0H, %%GH + vmovdqa64 %%T0L, %%GL + vmovdqa64 %%T0M1, %%GM + vpxorq %%T0M2, %%T0M2 +%assign first_result 0 +%endif +%endif +%endif + +%rep (blocks_left / 4) +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + vmovdqu64 %%HK, [%%KP + hashk] +%if first_result == 1 + vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0 + vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; TM2 = a0*b1 +%assign first_result 0 +%else + vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0 + vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif +%undef %%REG_IN +%assign reg_idx (reg_idx + 1) +%assign hashk (hashk + 64) +%assign blocks_left (blocks_left - 4) +%endrep + +%if blocks_left > 0 +;; There are 1, 2 or 3 blocks left to process. +;; It may also be that they are the only blocks to process. 
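+;; If no 4-block group was processed above (first_result == 1), the products go
+;; straight into the T0* accumulators; otherwise they go into T1* and are xor-ed
+;; into T0* afterwards. The 1 to 3 leftover blocks are multiplied at XMM, YMM or
+;; ZMM width as appropriate.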
+ +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + +%if first_result == 1 +;; Case where %%NUM_BLOCKS = 1, 2 or 3 +%xdefine %%OUT_H %%T0H +%xdefine %%OUT_L %%T0L +%xdefine %%OUT_M1 %%T0M1 +%xdefine %%OUT_M2 %%T0M2 +%else +%xdefine %%OUT_H %%T1H +%xdefine %%OUT_L %%T1L +%xdefine %%OUT_M1 %%T1M1 +%xdefine %%OUT_M2 %%T1M2 +%endif + +%if blocks_left == 1 + vmovdqu64 XWORD(%%HK), [%%KP + hashk] + vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1 + vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0 + vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0 + vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1 +%elif blocks_left == 2 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1 + vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0 + vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0 + vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1 +%else ; blocks_left == 3 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vinserti64x2 %%HK, [%%KP + hashk + 32], 2 + vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1 +%endif ; blocks_left + +%undef %%REG_IN +%undef %%OUT_H +%undef %%OUT_L +%undef %%OUT_M1 +%undef %%OUT_M2 + +%if first_result != 1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif + +%endif ; blocks_left > 0 + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 + vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T1M1 + VHPXORI4x128 %%T0L, %%T1M2 + + ;; reduction + vmovdqa64 XWORD(%%HK), [rel POLY2] + VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \ + XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2) +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +;;; Input: A and B (128-bits each, bit-reflected) +;;; Output: C = A*B*x mod poly, (i.e. >>1 ) +;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
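+;;; Note on the arithmetic (a sketch of the standard identity, not specific to this file):
+;;; with a = a1:a0 and b = b1:b0 split into 64-bit halves, the carry-less product is
+;;;     a * b = a1*b1 * x^128  xor  (a1*b0 xor a0*b1) * x^64  xor  a0*b0
+;;; which is what the four vpclmulqdq immediates (0x11, 0x00, 0x01, 0x10) below compute.
+;;; The middle term is split with vpslldq/vpsrldq and folded into the high and low halves
+;;; before the two-phase reduction by POLY2. The (128,127,126,121,0) notation above is the
+;;; bit-reflected form of the GCM polynomial x^128 + x^7 + x^2 + x + 1.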
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 + vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 + vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 + vpxorq %%GH, %%GH, %%T3 + + + vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs + vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs + + vpxorq %%T1, %%T1, %%T3 + vpxorq %%GH, %%GH, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu64 %%T3, [rel POLY2] + + vpclmulqdq %%T2, %%T3, %%GH, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%GH, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R + + vpclmulqdq %%GH, %%T3, %%GH, 0x10 + vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts + + ; second phase of the reduction complete, the result is in %%GH + vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512 +;;; functions, but are kept to allow users to switch cpu architectures between calls +;;; of pre, init, update, and finalize. 
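+;;; Illustrative call sketch (register choices here are illustrative only, they are not
+;;; taken from this file): with the key structure pointer in rax and HashKey<<1 mod poly
+;;; in xmm6, the table of key powers could be populated with
+;;;     PRECOMPUTE rax, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+;;; where xmm0-xmm5 are scratch registers. Each iteration below multiplies the running
+;;; power by %%HK with GHASH_MUL and stores it at HashKey_i, for i = 2 up to 48
+;;; (or 128 when GCM_BIG_DATA is defined).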
+%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + vmovdqa %%T5, %%HK + + ;; GHASH keys 2 to 48 or 128 +%ifdef GCM_BIG_DATA +%assign max_hkey_idx 128 +%else +%assign max_hkey_idx 48 +%endif + +%assign i 2 +%rep (max_hkey_idx - 1) + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly + vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5 +%assign i (i + 1) +%endrep + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; READ_SMALL_DATA_INPUT +;;; Packs xmm register with data when data input is less or equal to 16 bytes +;;; Returns 0 if data has length 0 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 5 +%define %%OUTPUT %1 ; [out] xmm register +%define %%INPUT %2 ; [in] buffer pointer to read from +%define %%LENGTH %3 ; [in] number of bytes to read +%define %%TMP1 %4 ; [clobbered] +%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask + + cmp %%LENGTH, 16 + jge %%_read_small_data_ge16 + lea %%TMP1, [rel byte_len_to_mask_table] +%ifidn __OUTPUT_FORMAT__, win64 + add %%TMP1, %%LENGTH + add %%TMP1, %%LENGTH + kmovw %%MASK, [%%TMP1] +%else + kmovw %%MASK, [%%TMP1 + %%LENGTH*2] +%endif + vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT] + jmp %%_read_small_data_end +%%_read_small_data_ge16: + VX512LDR %%OUTPUT, [%%INPUT] + mov %%TMP1, 0xffff + kmovq %%MASK, %%TMP1 +%%_read_small_data_end: +%endmacro ; READ_SMALL_DATA_INPUT + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). 
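+; Note: each 128-byte iteration folds eight AAD blocks A0..A7 into the running hash Y
+; using the precomputed key powers, i.e. (in GF(2^128))
+;     Y' = (Y xor A0)*H^8 xor A1*H^7 xor ... xor A7*H^1
+; which is algebraically the same as eight sequential Y = (Y xor Ai)*H updates but needs
+; only one reduction. The tail path below handles 1 to 8 (possibly partial) blocks using
+; a byte mask taken from byte64_len_to_mask_table.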
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 18 +%define %%A_IN %1 ; [in] AAD text pointer +%define %%A_LEN %2 ; [in] AAD length +%define %%AAD_HASH %3 ; [out] xmm ghash value +%define %%GDATA_KEY %4 ; [in] pointer to keys +%define %%ZT0 %5 ; [clobbered] ZMM register +%define %%ZT1 %6 ; [clobbered] ZMM register +%define %%ZT2 %7 ; [clobbered] ZMM register +%define %%ZT3 %8 ; [clobbered] ZMM register +%define %%ZT4 %9 ; [clobbered] ZMM register +%define %%ZT5 %10 ; [clobbered] ZMM register +%define %%ZT6 %11 ; [clobbered] ZMM register +%define %%ZT7 %12 ; [clobbered] ZMM register +%define %%ZT8 %13 ; [clobbered] ZMM register +%define %%ZT9 %14 ; [clobbered] ZMM register +%define %%T1 %15 ; [clobbered] GP register +%define %%T2 %16 ; [clobbered] GP register +%define %%T3 %17 ; [clobbered] GP register +%define %%MASKREG %18 ; [clobbered] mask register + +%define %%SHFMSK %%ZT9 +%define %%POLY %%ZT8 +%define %%TH %%ZT7 +%define %%TM %%ZT6 +%define %%TL %%ZT5 + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxorq %%AAD_HASH, %%AAD_HASH + + vmovdqa64 %%SHFMSK, [rel SHUF_MASK] + vmovdqa64 %%POLY, [rel POLY2] + +%%_get_AAD_loop128: + cmp %%T2, 128 + jl %%_exit_AAD_loop128 + + vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3) + vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7) + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + + VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL + VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL + + ;; result in %%ZT1(H):%%ZT2(L) + ;; reduce and put the result in AAD_HASH + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) + + sub %%T2, 128 + je %%_CALC_AAD_done + + add %%T1, 128 + jmp %%_get_AAD_loop128 + +%%_exit_AAD_loop128: + or %%T2, %%T2 + jz %%_CALC_AAD_done + + ;; prep mask source address + lea %%T3, [rel byte64_len_to_mask_table] + lea %%T3, [%%T3 + %%T2*8] + + ;; calculate number of blocks to ghash (including partial bytes) + add %%T2, 15 + and %%T2, -16 ; 1 to 8 blocks possible here + shr %%T2, 4 + cmp %%T2, 7 + je %%_AAD_blocks_7 + cmp %%T2, 6 + je %%_AAD_blocks_6 + cmp %%T2, 5 + je %%_AAD_blocks_5 + cmp %%T2, 4 + je %%_AAD_blocks_4 + cmp %%T2, 3 + je %%_AAD_blocks_3 + cmp %%T2, 2 + je %%_AAD_blocks_2 + cmp %%T2, 1 + je %%_AAD_blocks_1 + ;; fall through for 8 blocks + + ;; The flow of each of these cases is identical: + ;; - load blocks plain text + ;; - shuffle loaded blocks + ;; - xor in current hash value into block 0 + ;; - perform up multiplications with ghash keys + ;; - jump to reduction code +%%_AAD_blocks_8: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 8 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_7: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, 
%%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 7 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_6: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 6 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_5: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 5 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_4: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 4 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_3: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 3 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_2: + kmovq %%MASKREG, [%%T3] + vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 2 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_1: + kmovq %%MASKREG, [%%T3] + vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 1 + +%%_AAD_blocks_done: + ;; Multiplications have been done. Do the reduction now + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) +%%_CALC_AAD_done: + ;; result in AAD_HASH + +%endmacro ; CALC_AAD_HASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; PARTIAL_BLOCK +;;; Handles encryption/decryption and the tag partial blocks between +;;; update calls. +;;; Requires the input data be at least 1 byte long. 
+;;; Output: +;;; A cipher/plain of the first partial block (CYPH_PLAIN_OUT), +;;; AAD_HASH and updated GDATA_CTX +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 22 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated) +%define %%AAD_HASH %7 ; [out] updated GHASH value +%define %%ENC_DEC %8 ; [in] cipher direction +%define %%GPTMP0 %9 ; [clobbered] GP temporary register +%define %%GPTMP1 %10 ; [clobbered] GP temporary register +%define %%GPTMP2 %11 ; [clobbered] GP temporary register +%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register +%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register +%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register +%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register +%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register +%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register +%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register +%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register +%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register +%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register +%define %%MASKREG %22 ; [clobbered] mask temporary register + +%define %%XTMP0 XWORD(%%ZTMP0) +%define %%XTMP1 XWORD(%%ZTMP1) +%define %%XTMP2 XWORD(%%ZTMP2) +%define %%XTMP3 XWORD(%%ZTMP3) +%define %%XTMP4 XWORD(%%ZTMP4) +%define %%XTMP5 XWORD(%%ZTMP5) +%define %%XTMP6 XWORD(%%ZTMP6) +%define %%XTMP7 XWORD(%%ZTMP7) +%define %%XTMP8 XWORD(%%ZTMP8) +%define %%XTMP9 XWORD(%%ZTMP9) + +%define %%LENGTH %%GPTMP0 +%define %%IA0 %%GPTMP1 +%define %%IA1 %%GPTMP2 + + mov %%LENGTH, [%%GDATA_CTX + PBlockLen] + or %%LENGTH, %%LENGTH + je %%_partial_block_done ;Leave Macro if no partial blocks + + READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG + + ;; XTMP1 = my_ctx_data.partial_block_enc_key + vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey] + vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey] + + ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes + ;; (16 - %%LENGTH) is the number of bytes in plaintext mod 16) + lea %%IA0, [rel SHIFT_MASK] + add %%IA0, %%LENGTH + vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask + vpshufb %%XTMP1, %%XTMP3 + +%ifidn %%ENC_DEC, DEC + ;; keep copy of cipher text in %%XTMP4 + vmovdqa64 %%XTMP4, %%XTMP0 +%endif + vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn) + + ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block + ;; Determine if partial block is not being filled and shift mask accordingly + mov %%IA1, %%PLAIN_CYPH_LEN + add %%IA1, %%LENGTH + sub %%IA1, 16 + jge %%_no_extra_mask + sub %%IA0, %%IA1 +%%_no_extra_mask: + ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1 + ;; - mask out bottom %%LENGTH bytes of %%XTMP1 + vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK] + vpand %%XTMP1, %%XTMP0 + +%ifidn %%ENC_DEC, DEC + vpand %%XTMP4, %%XTMP0 + vpshufb %%XTMP4, [rel SHUF_MASK] + vpshufb %%XTMP4, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP4 +%else + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP1 +%endif + cmp %%IA1, 0 + jl %%_partial_incomplete + + ;; GHASH computation for the last <16 Byte block + GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9 + + mov qword 
[%%GDATA_CTX + PBlockLen], 0 + + ;; Set %%IA1 to be the number of bytes to write out + mov %%IA0, %%LENGTH + mov %%LENGTH, 16 + sub %%LENGTH, %%IA0 + jmp %%_enc_dec_done + +%%_partial_incomplete: +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], %%IA0 +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif + mov %%LENGTH, %%PLAIN_CYPH_LEN + +%%_enc_dec_done: + ;; output encrypted Bytes + + lea %%IA0, [rel byte_len_to_mask_table] + kmovw %%MASKREG, [%%IA0 + %%LENGTH*2] + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH + +%ifidn %%ENC_DEC, ENC + ;; shuffle XTMP1 back to output as ciphertext + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 +%endif + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1 + add %%DATA_OFFSET, %%LENGTH +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +%macro GHASH_SINGLE_MUL 9 +%define %%GDATA %1 +%define %%HASHKEY %2 +%define %%CIPHER %3 +%define %%STATE_11 %4 +%define %%STATE_00 %5 +%define %%STATE_MID %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%FIRST %9 + + vmovdqu %%T1, [%%GDATA + %%HASHKEY] +%ifidn %%FIRST, first + vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 + vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%else + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 + vpxor %%STATE_11, %%STATE_11, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 + vpxor %%STATE_00, %%STATE_00, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 + vpxor %%STATE_MID, %%STATE_MID, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%endif + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to "warm-up" pipeline for GHASH_8_ENCRYPT_8_PARALLEL +;;; macro code. It is called only for data lenghts 128 and above. +;;; The flow is as follows: +;;; - encrypt the initial %%num_initial_blocks blocks (can be 0) +;;; - encrypt the next 8 blocks and stitch with +;;; GHASH for the first %%num_initial_blocks +;;; - the last 8th block can be partial (lengths between 129 and 239) +;;; - partial block ciphering is handled within this macro +;;; - top bytes of such block are cleared for +;;; the subsequent GHASH calculations +;;; - PBlockEncKey needs to be setup in case of multi-call +;;; - top bytes of the block need to include encrypted counter block so that +;;; when handling partial block case text is read and XOR'ed against it. +;;; This needs to be in un-shuffled format. 
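+;;; Note on the stitching below: the AES rounds for the next 8 counter blocks are
+;;; interleaved with GHASH of the first %%num_initial_blocks in roughly three slices,
+;;; VCLMUL_1_TO_8_STEP1 right after the ARK round, VCLMUL_1_TO_8_STEP2 after about a
+;;; third of the rounds and VCLMUL_REDUCE after about two thirds, presumably so that
+;;; the multiplier and AES unit latencies overlap instead of serializing.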
+ +%macro INITIAL_BLOCKS 26-27 +%define %%GDATA_KEY %1 ; [in] pointer to GCM keys +%define %%GDATA_CTX %2 ; [in] pointer to GCM context +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%LENGTH %5 ; [in/out] number of bytes to process +%define %%DATA_OFFSET %6 ; [in/out] data offset +%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%CTR %8 ; [in/out] XMM counter block +%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash +%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH +%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH +%define %%ZT3 %12 ; [clobbered] ZMM temporary +%define %%ZT4 %13 ; [clobbered] ZMM temporary +%define %%ZT5 %14 ; [clobbered] ZMM temporary +%define %%ZT6 %15 ; [clobbered] ZMM temporary +%define %%ZT7 %16 ; [clobbered] ZMM temporary +%define %%ZT8 %17 ; [clobbered] ZMM temporary +%define %%ZT9 %18 ; [clobbered] ZMM temporary +%define %%ZT10 %19 ; [clobbered] ZMM temporary +%define %%ZT11 %20 ; [clobbered] ZMM temporary +%define %%ZT12 %21 ; [clobbered] ZMM temporary +%define %%IA0 %22 ; [clobbered] GP temporary +%define %%IA1 %23 ; [clobbered] GP temporary +%define %%ENC_DEC %24 ; [in] ENC/DEC selector +%define %%MASKREG %25 ; [clobbered] mask register +%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask +%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes) + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T3 XWORD(%%ZT3) +%define %%T4 XWORD(%%ZT4) +%define %%T5 XWORD(%%ZT5) +%define %%T6 XWORD(%%ZT6) +%define %%T7 XWORD(%%ZT7) +%define %%T8 XWORD(%%ZT8) +%define %%T9 XWORD(%%ZT9) + +%define %%TH %%ZT10 +%define %%TM %%ZT11 +%define %%TL %%ZT12 + +;; determine if partial block code needs to be added +%assign partial_block_possible 1 +%if %0 > 26 +%ifidn %%PARTIAL_PRESENT, no_partial_block +%assign partial_block_possible 0 +%endif +%endif + +%if %%num_initial_blocks > 0 + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd %%T3, %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] +%endif + + ;; extract new counter value (%%T3) + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1) +%else + vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT5, %%ZT6, no_zmm, no_zmm + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT1, j, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%num_initial_blocks, NROUNDS +%assign j (j + 1) +%endrep + + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + + ;; Shuffle the cipher text 
blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; adjust data offset and length + sub %%LENGTH, (%%num_initial_blocks * 16) + add %%DATA_OFFSET, (%%num_initial_blocks * 16) + + ;; At this stage + ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed + +%endif ; %%num_initial_blocks > 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; - cipher of %%num_initial_blocks is done + ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4) + ;; - save the last block in %%CTR + ;; - shuffle the blocks for AES + ;; - stitch encryption of the new blocks with + ;; GHASHING the previous blocks + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] + vextracti32x4 %%CTR, %%ZT4, 3 + + vpshufb %%ZT3, %%SHUFMASK + vpshufb %%ZT4, %%SHUFMASK + +%if partial_block_possible != 0 + ;; get text load/store mask (assume full mask by default) + mov %%IA0, 0xffff_ffff_ffff_ffff +%if %%num_initial_blocks > 0 + ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0 + ;; This macro is executed for lenght 128 and up, + ;; zero length is checked in GCM_ENC_DEC. + ;; We know there is partial block if: + ;; LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jge %%_initial_partial_block_continue + mov %%IA1, rcx + mov rcx, 128 + sub rcx, %%LENGTH + shr %%IA0, cl + mov rcx, %%IA1 +%%_initial_partial_block_continue: +%endif + kmovq %%MASKREG, %%IA0 + ;; load plain or cipher text (masked) + ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG +%else + ;; load plain or cipher text + ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm +%endif ;; partial_block_possible + + ;; === AES ROUND 0 +%assign aes_round 0 + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;; === GHASH blocks 4-7 +%if (%%num_initial_blocks > 0) + ;; Hash in AES state + vpxorq %%ZT5, %%ZT5, %%AAD_HASH + + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [1/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 2) + + ;; === GHASH blocks 0-3 and gather +%if (%%num_initial_blocks > 0) + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \ + %%ZT7, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [2/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, 
no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 2) + + ;; === GHASH reduction + +%if (%%num_initial_blocks > 0) + ;; [out] AAD_HASH - hash output + ;; [in] T8 - polynomial + ;; [in] T6 - high, T5 - low + ;; [clobbered] T9, T7 - temporary + vmovdqu64 %%T8, [rel POLY2] + VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9 +%endif + + ;; === [3/3] of AES rounds + +%rep (((NROUNDS + 1) / 3) + 2) +%if aes_round < (NROUNDS + 2) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif +%endrep ; %rep ((NROUNDS + 1) / 2) + +%if partial_block_possible != 0 + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG + ;; check if there is partial block + cmp %%LENGTH, 128 + jl %%_initial_save_partial + ;; adjust offset and length + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jmp %%_initial_blocks_done +%%_initial_save_partial: + ;; partial block case + ;; - save the partial block in unshuffled format + ;; - ZT4 is partially XOR'ed with data and top bytes contain + ;; encrypted counter block only + ;; - save number of bytes process in the partial block + ;; - adjust offset and zero the length + ;; - clear top bytes of the partial block for subsequent GHASH calculations + vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3 + add %%DATA_OFFSET, %%LENGTH + sub %%LENGTH, (128 - 16) + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + xor %%LENGTH, %%LENGTH + vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4 +%%_initial_blocks_done: +%else + ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 +%endif ;; partial_block_possible + + ;; Shuffle AES result for GHASH. +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT1 & ZT2 + vpshufb %%ZT1, %%SHUFMASK + vpshufb %%ZT2, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + vpshufb %%ZT1, %%ZT3, %%SHUFMASK + vpshufb %%ZT2, %%ZT4, %%SHUFMASK +%endif ; Encrypt + + ;; Current hash value is in AAD_HASH + + ;; Combine GHASHed value with the corresponding ciphertext + vpxorq %%ZT1, %%ZT1, %%AAD_HASH + +%endmacro ; INITIAL_BLOCKS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. +;;; It may look similar to INITIAL_BLOCKS but its usage is different: +;;; - first encrypts/decrypts required number of blocks and then +;;; ghashes these blocks +;;; - Small packets or left over data chunks (<256 bytes) +;;; - single or multi call +;;; - Remaining data chunks below 256 bytes (multi buffer code) +;;; +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. 
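+;;; Worked example (illustrative numbers, not taken from this file): a single_call pass
+;;; over 200 bytes gives %%num_initial_blocks = 13 (12 full blocks plus one 8-byte
+;;; partial block). Since more than 12 blocks are in flight, the code below subtracts
+;;; 3*64 from the length, leaving 8, and byte64_len_to_mask_table then yields a
+;;; load/store mask covering only the 8 valid bytes of the last 64-byte lane.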
+%macro INITIAL_BLOCKS_PARTIAL 41 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer +%define %%PLAIN_CYPH_IN %4 ; [in] text out pointer +%define %%LENGTH %5 ; [in/clobbered] length in bytes +%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated) +%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) +%define %%CTR %8 ; [in/out] current counter value +%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value +%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC) +%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call +%define %%ZT0 %12 ; [clobbered] ZMM temporary +%define %%ZT1 %13 ; [clobbered] ZMM temporary +%define %%ZT2 %14 ; [clobbered] ZMM temporary +%define %%ZT3 %15 ; [clobbered] ZMM temporary +%define %%ZT4 %16 ; [clobbered] ZMM temporary +%define %%ZT5 %17 ; [clobbered] ZMM temporary +%define %%ZT6 %18 ; [clobbered] ZMM temporary +%define %%ZT7 %19 ; [clobbered] ZMM temporary +%define %%ZT8 %20 ; [clobbered] ZMM temporary +%define %%ZT9 %21 ; [clobbered] ZMM temporary +%define %%ZT10 %22 ; [clobbered] ZMM temporary +%define %%ZT11 %23 ; [clobbered] ZMM temporary +%define %%ZT12 %24 ; [clobbered] ZMM temporary +%define %%ZT13 %25 ; [clobbered] ZMM temporary +%define %%ZT14 %26 ; [clobbered] ZMM temporary +%define %%ZT15 %27 ; [clobbered] ZMM temporary +%define %%ZT16 %28 ; [clobbered] ZMM temporary +%define %%ZT17 %29 ; [clobbered] ZMM temporary +%define %%ZT18 %30 ; [clobbered] ZMM temporary +%define %%ZT19 %31 ; [clobbered] ZMM temporary +%define %%ZT20 %32 ; [clobbered] ZMM temporary +%define %%ZT21 %33 ; [clobbered] ZMM temporary +%define %%ZT22 %34 ; [clobbered] ZMM temporary +%define %%GH %35 ; [in] ZMM ghash sum (high) +%define %%GL %36 ; [in] ZMM ghash sum (low) +%define %%GM %37 ; [in] ZMM ghash sum (middle) +%define %%IA0 %38 ; [clobbered] GP temporary +%define %%IA1 %39 ; [clobbered] GP temporary +%define %%MASKREG %40 ; [clobbered] mask register +%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T7 XWORD(%%ZT7) + +%define %%CTR0 %%ZT3 +%define %%CTR1 %%ZT4 +%define %%CTR2 %%ZT8 +%define %%CTR3 %%ZT9 + +%define %%DAT0 %%ZT5 +%define %%DAT1 %%ZT6 +%define %%DAT2 %%ZT10 +%define %%DAT3 %%ZT11 + +%ifnidn %%GH, no_zmm +%ifnidn %%GL, no_zmm +%ifnidn %%GM, no_zmm + ;; when temporary sums are passed then zero HASH IN value + ;; - whatever it holds it is invalid in this case + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT +%endif +%endif +%endif + ;; Copy ghash to temp reg + vmovdqa64 %%T2, %%HASH_IN_OUT + + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd XWORD(%%CTR0), %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234] +%if %%num_initial_blocks > 4 + vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678] +%endif +%if %%num_initial_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888] +%endif +%if %%num_initial_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] +%endif +%endif + + ;; get load/store mask + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if %%num_initial_blocks > 12 + sub %%IA1, 3 * 64 +%elif %%num_initial_blocks > 8 + sub %%IA1, 2 * 64 +%elif %%num_initial_blocks > 4 + sub %%IA1, 64 +%endif + 
kmovq %%MASKREG, [%%IA0 + %%IA1*8] + + ;; extract new counter value + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZT1, j, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks, NROUNDS +%assign j (j + 1) +%endrep + + ;; retrieve the last cipher counter block (partially XOR'ed with text) + ;; - this is needed for partial block cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13) +%endif + + ;; write cipher/plain text back to output and + ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG + + ;; zero bytes outside the mask before hashing +%if %%num_initial_blocks <= 4 + vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0 +%elif %%num_initial_blocks <= 8 + vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1 +%elif %%num_initial_blocks <= 12 + vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2 +%else + vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3 +%endif + + ;; Shuffle the cipher text blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in CTR0-CTR3 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; Extract the last block for partials and multi_call cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1 +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5 +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9 +%else + vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13 +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; update data offset +%if %%num_initial_blocks > 1 + ;; The final block of data may be <16B + add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1) + sub %%LENGTH, 16 * (%%num_initial_blocks - 1) +%endif + +%if 
%%num_initial_blocks < 16 + ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256. + cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Hash all of the data + + ;; ZT2 - incoming AAD hash (low 128bits) + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks + + jmp %%_small_initial_compute_done +%endif ; %if %%num_initial_blocks < 16 + +%%_small_initial_partial_block: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;; Handle ghash for a <16B final block + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of call we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + ;; %%T1 is ciphered counter block + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1 + +%ifidn %%INSTANCE_TYPE, multi_call +%assign k (%%num_initial_blocks - 1) +%assign last_block_to_hash 1 +%else +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if (%%num_initial_blocks > last_block_to_hash) + + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k + + ;; just fall through no jmp needed +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length. + ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize. + +%assign need_for_reduction 1 +%ifidn %%GH, no_zmm +%ifidn %%GL, no_zmm +%ifidn %%GM, no_zmm +;; if %%GH, %%GL & %%GM not passed then reduction is not required +%assign need_for_reduction 0 +%endif +%endif +%endif + +%if need_for_reduction == 0 + ;; The hash should end up in HASH_IN_OUT. + ;; The only way we should get here is if there is + ;; a partial block of data, so xor that into the hash. 
+ vpxorq %%HASH_IN_OUT, %%T2, %%T7 +%else + ;; right - here we have nothing to ghash in the small data but + ;; we have GHASH sums passed through that we need to gather and reduce + + ;; integrate TM into TH and TL + vpsrldq %%ZT12, %%GM, 8 + vpslldq %%ZT13, %%GM, 8 + vpxorq %%GH, %%GH, %%ZT12 + vpxorq %%GL, %%GL, %%ZT13 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%GH, %%ZT12 + VHPXORI4x128 %%GL, %%ZT13 + + ;; reduction + vmovdqa64 XWORD(%%ZT12), [rel POLY2] + VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \ + XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14) + + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif + ;; The result is in %%HASH_IN_OUT + jmp %%_after_reduction +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; After GHASH reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_small_initial_compute_done: + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. +%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place +%if %%num_initial_blocks != 16 + ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH, stored in [PBlockLen] is never zero + or %%LENGTH, %%LENGTH + je %%_after_reduction +%endif ; %%num_initial_blocks != 16 + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%%_after_reduction: + ;; Final hash is now in HASH_IN_OUT + +%endmacro ; INITIAL_BLOCKS_PARTIAL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 8 blocks at a time +;;; - ghash the 8 previously encrypted ciphertext blocks +;;; For partial block case and multi_call , AES_PARTIAL_BLOCK on output +;;; contains encrypted counter block. 
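+;;; Note on %%DO_REDUCTION: with the no-reduction option the per-iteration high, medium
+;;; and low 4x128-bit partial products are only XOR-accumulated into %%TO_REDUCE_H/M/L;
+;;; the polynomial reduction by POLY2 is deferred until a later invocation requests the
+;;; final reduction, so a run of back-to-back 8-block iterations pays for a single
+;;; reduction instead of one per call.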
+%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3 +%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7 +%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3 +%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7 +%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES) +%define %%loop_idx %10 ; [in] counter block prep selection "add+shuffle" or "add" +%define %%ENC_DEC %11 ; [in] cipher direction +%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial" +%define %%IA0 %13 ; [clobbered] temporary GP register +%define %%IA1 %14 ; [clobbered] temporary GP register +%define %%LENGTH %15 ; [in] length +%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection +%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1 +%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5 +%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash) +%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores +%define %%DO_REDUCTION %34 ; [in] "reduction", "no_reduction", "final_reduction" +%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction" +%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction" +%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction" + +%define %%GH1H %%ZT10 +%define %%GH1L %%ZT11 +%define %%GH1M1 %%ZT12 +%define %%GH1M2 %%ZT13 + +%define %%GH2H %%ZT14 +%define %%GH2L %%ZT15 +%define %%GH2M1 %%ZT16 +%define %%GH2M2 %%ZT17 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; populate counter blocks for cipher part +%ifidn %%loop_idx, in_order + ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro + ;; it has to be kept in unshuffled format + vpshufb %%ZT1, %%CTR1, %%SHFMSK + vpshufb %%ZT2, %%CTR2, %%SHFMSK +%else + vmovdqa64 %%ZT1, %%CTR1 + vmovdqa64 %%ZT2, %%CTR2 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + +%assign aes_round 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;;================================================== + ;; GHASH 4 blocks + vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0 + vpclmulqdq %%GH1M1, 
%%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; GHASH 4 blocks + vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1 + vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high) +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2 + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H + vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L +%endif +%ifidn %%DO_REDUCTION, do_reduction + ;; phase 1: add mid products together + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpxorq %%GH1M1, %%GH1M1, %%GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 2 x AES ROUND + + ;; ================================================= + ;; Add mid product to high and low then + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif +%ifidn %%DO_REDUCTION, do_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 +%endif +%ifnidn %%DO_REDUCTION, no_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;; 
================================================= + ;; first phase of reduction +%ifnidn %%DO_REDUCTION, no_reduction + vmovdqu64 XWORD(%%GH2M2), [rel POLY2] + vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs + vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduct +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;; ================================================= + ;; second phase of the reduction +%ifnidn %%DO_REDUCTION, no_reduction + vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00 + vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10 + vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts + ;; ZT13 = ZT13 xor ZT16 xor GH1H + vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; all remaining AES rounds but the last +%rep (NROUNDS + 2) +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load/store mask (partial case) and load the text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%else + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH + sub %%IA1, 64 + kmovq %%MASKREG, [%%IA0 + 8*%%IA1] + vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store the cipher/plain text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 +%else + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2 +%endif + + ;; ================================================= + ;; prep cipher text blocks for the next ghash round + +%ifnidn %%FULL_PARTIAL, full +%ifidn %%INSTANCE_TYPE, multi_call + ;; for partial block & multi_call we need encrypted counter block + vpxorq %%ZT3, %%ZT2, %%ZT5 + vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3 +%endif + ;; for GHASH computation purpose clear the top bytes of the partial block +%ifidn %%ENC_DEC, ENC + vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2 +%else + vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5 +%endif +%endif ; %ifnidn %%FULL_PARTIAL, full + + ;; ================================================= + ;; shuffle cipher text blocks for 
GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK +%else + vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK +%endif + +%ifidn %%DO_REDUCTION, do_reduction + ;; ================================================= + ;; XOR current GHASH value (ZT13) into block 0 + vpxorq %%GHASHIN_AESOUT_B03, %%ZT13 +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value (ZT13) in TO_REDUCE_L + vmovdqa64 %%TO_REDUCE_L, %%ZT13 +%endif + +%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 16 blocks at a time +;;; - ghash the 16 previously encrypted ciphertext blocks +;;; - no partial block or multi_call handling here +%macro GHASH_16_ENCRYPT_16_PARALLEL 42 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian +%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check +%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key +%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out +%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in +%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) +%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher) +%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher) +%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher) +%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT19 %29 ; [clobbered] temporary ZMM +%define %%ZT20 %30 ; [clobbered] temporary ZMM +%define %%ZT21 %31 ; [clobbered] temporary ZMM +%define %%ZT22 %32 ; [clobbered] temporary ZMM +%define %%ZT23 %33 ; [clobbered] temporary ZMM +%define %%ADDBE_4x4 %34 ; [in] ZMM with 4x128bits 4 in big-endian +%define %%ADDBE_1234 %35 ; [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian +%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum +%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum +%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum +%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time" +%define %%ENC_DEC %40 ; [in] cipher direction +%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset +%define %%GHASH_IN %42 ; [in] current GHASH value or 
"no_ghash_in" + +%define %%B00_03 %%ZT1 +%define %%B04_07 %%ZT2 +%define %%B08_11 %%ZT3 +%define %%B12_15 %%ZT4 + +%define %%GH1H %%ZT5 ; @note: do not change this mapping +%define %%GH1L %%ZT6 +%define %%GH1M %%ZT7 +%define %%GH1T %%ZT8 + +%define %%GH2H %%ZT9 +%define %%GH2L %%ZT10 +%define %%GH2M %%ZT11 +%define %%GH2T %%ZT12 + +%define %%RED_POLY %%GH2T +%define %%RED_P1 %%GH2L +%define %%RED_T1 %%GH2H +%define %%RED_T2 %%GH2M + +%define %%GH3H %%ZT13 +%define %%GH3L %%ZT14 +%define %%GH3M %%ZT15 +%define %%GH3T %%ZT16 + +%define %%DATA1 %%ZT13 +%define %%DATA2 %%ZT14 +%define %%DATA3 %%ZT15 +%define %%DATA4 %%ZT16 + +%define %%AESKEY1 %%ZT17 +%define %%AESKEY2 %%ZT18 + +%define %%GHKEY1 %%ZT19 +%define %%GHKEY2 %%ZT20 +%define %%GHDAT1 %%ZT21 +%define %%GHDAT2 %%ZT22 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp BYTE(%%CTR_CHECK), (256 - 16) + jae %%_16_blocks_overflow + vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp %%_16_blocks_ok +%%_16_blocks_overflow: + vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%SHFMSK +%%_16_blocks_ok: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)] +%ifnidn %%GHASH_IN, no_ghash_in + vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%else + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%endif + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; save counter for the next round + ;; increment counter overflow check register + vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)] + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + + vpxorq %%B00_03, %%AESKEY1 + vpxorq %%B04_07, %%AESKEY1 + vpxorq %%B08_11, %%AESKEY1 + vpxorq %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)] + + ;;================================================== + ;; GHASH 4 blocks (15 to 12) + vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0 + vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)] + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 1 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)] + + ;; ================================================= + ;; GHASH 4 blocks (11 to 8) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + 
vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 2 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)] + + ;; ================================================= + ;; GHASH 4 blocks (7 to 4) + vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 3 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)] + + ;; ================================================= + ;; Gather (XOR) GHASH for 12 blocks + vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96 + vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96 + vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96 + vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 4 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load plain/cipher text (recycle GH3xx registers) + VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)] + VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)] + VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)] + VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 5 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)] + + ;; ================================================= + ;; GHASH 4 blocks (3 to 0) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 6 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)] + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high) +%ifidn %%DO_REDUCTION, first_time + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM + vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH + vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL +%endif +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH + vpternlogq %%TO_REDUCE_L, 
%%GH1L, %%GH2L, 0x96 ; TL +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together + ;; also load polynomial constant for reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96 + + vpsrldq %%GH2M, %%GH1M, 8 + vpslldq %%GH1M, %%GH1M, 8 + + vmovdqa64 XWORD(%%RED_POLY), [rel POLY2] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 7 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)] + + ;; ================================================= + ;; Add mid product to high and low +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 8 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)] + + ;; ================================================= + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 9 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS >= 11) + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)] +%endif + ;; ================================================= + ;; first phase of reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs + vpxorq XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1) ; first phase of the reduct +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds up to 11 (AES192) or 13 (AES256) + ;; AES128 is done +%if (NROUNDS >= 11) + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS == 13) + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last) +%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last) + + ;; ================================================= + ;; second phase of the reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00 + vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + 
vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10 + vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts + ;; GH1H = GH1H x RED_T1 x RED_T2 + vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round + vaesenclast %%B00_03, %%B00_03, %%AESKEY1 + vaesenclast %%B04_07, %%B04_07, %%AESKEY1 + vaesenclast %%B08_11, %%B08_11, %%AESKEY1 + vaesenclast %%B12_15, %%B12_15, %%AESKEY1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; XOR against plain/cipher text + vpxorq %%B00_03, %%B00_03, %%DATA1 + vpxorq %%B04_07, %%B04_07, %%DATA2 + vpxorq %%B08_11, %%B08_11, %%DATA3 + vpxorq %%B12_15, %%B12_15, %%DATA4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store cipher/plain text + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15 + + ;; ================================================= + ;; shuffle cipher text blocks for GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%B00_03, %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%B12_15, %%SHFMSK +%else + vpshufb %%B00_03, %%DATA1, %%SHFMSK + vpshufb %%B04_07, %%DATA2, %%SHFMSK + vpshufb %%B08_11, %%DATA3, %%SHFMSK + vpshufb %%B12_15, %%DATA4, %%SHFMSK +%endif + + ;; ================================================= + ;; store shuffled cipher text for ghashing + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15 + +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value through %%GH1H +%endif + +%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH the last 8 ciphertext blocks. 
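+;;; - products of all 8 blocks are accumulated first; a single
+;;;   POLY2/VCLMUL_REDUCE reduction is done at the end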
+;;; - optionally accepts GHASH product sums as input +%macro GHASH_LAST_8 10-13 +%define %%GDATA %1 ; [in] key pointer +%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7 +%define %%BL03 %3 ; [in/cloberred] ZMM AES blocks 0 to 3 +%define %%ZTH %4 ; [cloberred] ZMM temporary +%define %%ZTM %5 ; [cloberred] ZMM temporary +%define %%ZTL %6 ; [cloberred] ZMM temporary +%define %%ZT01 %7 ; [cloberred] ZMM temporary +%define %%ZT02 %8 ; [cloberred] ZMM temporary +%define %%ZT03 %9 ; [cloberred] ZMM temporary +%define %%AAD_HASH %10 ; [out] XMM hash value +%define %%GH %11 ; [in/optional] ZMM with GHASH high product sum +%define %%GL %12 ; [in/optional] ZMM with GHASH low product sum +%define %%GM %13 ; [in/optional] ZMM with GHASH mid product sum + + VCLMUL_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL + +%if %0 > 10 + ;; add optional sums before step2 + vpxorq %%ZTH, %%ZTH, %%GH + vpxorq %%ZTL, %%ZTL, %%GL + vpxorq %%ZTM, %%ZTM, %%GM +%endif + + VCLMUL_STEP2 %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, %%ZTH, %%ZTM, %%ZTL + + vmovdqa64 XWORD(%%ZT03), [rel POLY2] + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \ + XWORD(%%ZT01), XWORD(%%ZT02) +%endmacro ; GHASH_LAST_8 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH the last 7 cipher text blocks. +;;; - it uses same GHASH macros as GHASH_LAST_8 but with some twist +;;; - it loads GHASH keys for each of the data blocks, so that: +;;; - blocks 4, 5 and 6 will use GHASH keys 3, 2, 1 respectively +;;; - code ensures that unused block 7 and corresponding GHASH key are zeroed +;;; (clmul product is zero this way and will not affect the result) +;;; - blocks 0, 1, 2 and 3 will use USE GHASH keys 7, 6, 5 and 4 respectively +;;; - optionally accepts GHASH product sums as input +%macro GHASH_LAST_7 13-16 +%define %%GDATA %1 ; [in] key pointer +%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7 +%define %%BL03 %3 ; [in/cloberred] ZMM AES blocks 0 to 3 +%define %%ZTH %4 ; [cloberred] ZMM temporary +%define %%ZTM %5 ; [cloberred] ZMM temporary +%define %%ZTL %6 ; [cloberred] ZMM temporary +%define %%ZT01 %7 ; [cloberred] ZMM temporary +%define %%ZT02 %8 ; [cloberred] ZMM temporary +%define %%ZT03 %9 ; [cloberred] ZMM temporary +%define %%ZT04 %10 ; [cloberred] ZMM temporary +%define %%AAD_HASH %11 ; [out] XMM hash value +%define %%MASKREG %12 ; [clobbered] mask register to use for loads +%define %%IA0 %13 ; [clobbered] GP temporary register +%define %%GH %14 ; [in/optional] ZMM with GHASH high product sum +%define %%GL %15 ; [in/optional] ZMM with GHASH low product sum +%define %%GM %16 ; [in/optional] ZMM with GHASH mid product sum + + vmovdqa64 XWORD(%%ZT04), [rel POLY2] + + VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, %%ZTH, %%ZTM, %%ZTL, 7 + +%if %0 > 13 + ;; add optional sums before step2 + vpxorq %%ZTH, %%ZTH, %%GH + vpxorq %%ZTL, %%ZTL, %%GL + vpxorq %%ZTM, %%ZTM, %%GM +%endif + + VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \ + %%ZT01, %%ZT02, %%ZT03, \ + %%ZTH, %%ZTM, %%ZTL, 7 + + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \ + XWORD(%%ZT01), XWORD(%%ZT02) +%endmacro ; GHASH_LAST_7 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxorq %%XMM0, %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep + vaesenclast %%XMM0, 
[%%GDATA+16*i] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Save register content for the caller +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + mov rax, rsp + + sub rsp, STACK_FRAME_SIZE + and rsp, ~63 + + mov [rsp + STACK_GP_OFFSET + 0*8], r12 + mov [rsp + STACK_GP_OFFSET + 1*8], r13 + mov [rsp + STACK_GP_OFFSET + 2*8], r14 + mov [rsp + STACK_GP_OFFSET + 3*8], r15 + mov [rsp + STACK_GP_OFFSET + 4*8], rax ; stack + mov r14, rax ; r14 is used to retrieve stack args + mov [rsp + STACK_GP_OFFSET + 5*8], rbp + mov [rsp + STACK_GP_OFFSET + 6*8], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + STACK_GP_OFFSET + 7*8], rdi + mov [rsp + STACK_GP_OFFSET + 8*8], rsi +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + STACK_XMM_OFFSET + 0*16], xmm6 + vmovdqu [rsp + STACK_XMM_OFFSET + 1*16], xmm7 + vmovdqu [rsp + STACK_XMM_OFFSET + 2*16], xmm8 + vmovdqu [rsp + STACK_XMM_OFFSET + 3*16], xmm9 + vmovdqu [rsp + STACK_XMM_OFFSET + 4*16], xmm10 + vmovdqu [rsp + STACK_XMM_OFFSET + 5*16], xmm11 + vmovdqu [rsp + STACK_XMM_OFFSET + 6*16], xmm12 + vmovdqu [rsp + STACK_XMM_OFFSET + 7*16], xmm13 + vmovdqu [rsp + STACK_XMM_OFFSET + 8*16], xmm14 + vmovdqu [rsp + STACK_XMM_OFFSET + 9*16], xmm15 +%endif +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Restore register content for the caller +%macro FUNC_RESTORE 0 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_zmms_asm +%else + vzeroupper +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16] + vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16] + vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16] + vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16] + vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16] + vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16] + vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16] + vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16] + vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16] + vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16] +%endif + + ;; Required for Update/GMC_ENC + mov rbp, [rsp + STACK_GP_OFFSET + 5*8] + mov rbx, [rsp + STACK_GP_OFFSET + 6*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + STACK_GP_OFFSET + 7*8] + mov rsi, [rsp + STACK_GP_OFFSET + 8*8] +%endif + mov r12, [rsp + STACK_GP_OFFSET + 0*8] + mov r13, [rsp + STACK_GP_OFFSET + 1*8] + mov r14, [rsp + STACK_GP_OFFSET + 2*8] + mov r15, [rsp + STACK_GP_OFFSET + 3*8] + mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +;;; Additional Authentication data (A_IN), Additional Data length (A_LEN). +;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. 
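+;;; @note a 12-byte (96-bit) IV is assumed: the counter block is formed as
+;;;       J0 = IV || 0x00000001 (big endian), saved to OrigIV, and a
+;;;       byte-swapped (LE) copy is written to CurCount for counter updates.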
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 21 +%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer +%define %%GDATA_CTX %2 ; [in] GCM context pointer +%define %%IV %3 ; [in] IV pointer +%define %%A_IN %4 ; [in] AAD pointer +%define %%A_LEN %5 ; [in] AAD length in bytes +%define %%GPR1 %6 ; [clobbered] GP register +%define %%GPR2 %7 ; [clobbered] GP register +%define %%GPR3 %8 ; [clobbered] GP register +%define %%MASKREG %9 ; [clobbered] mask register +%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14) +%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2) +%define %%ZT0 %12 ; [clobbered] ZMM register +%define %%ZT1 %13 ; [clobbered] ZMM register +%define %%ZT2 %14 ; [clobbered] ZMM register +%define %%ZT3 %15 ; [clobbered] ZMM register +%define %%ZT4 %16 ; [clobbered] ZMM register +%define %%ZT5 %17 ; [clobbered] ZMM register +%define %%ZT6 %18 ; [clobbered] ZMM register +%define %%ZT7 %19 ; [clobbered] ZMM register +%define %%ZT8 %20 ; [clobbered] ZMM register +%define %%ZT9 %21 ; [clobbered] ZMM register + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \ + %%GPR1, %%GPR2, %%GPR3, %%MASKREG + + mov %%GPR1, %%A_LEN + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length + + xor %%GPR1, %%GPR1 + mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0 + + ;; read 12 IV bytes and pad with 0x00000001 + vmovdqu8 %%CUR_COUNT, [rel ONEf] + mov %%GPR2, %%IV + mov %%GPR1, 0x0000_0000_0000_0fff + kmovq %%MASKREG, %%GPR1 + vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1 + + vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv + + ;; store IV as counter in LE format + vpshufb %%CUR_COUNT, [rel SHUF_MASK] + vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Cipher and ghash of payloads shorter than 256 bytes +;;; - number of blocks in the message comes as argument +;;; - depending on the number of blocks an optimized variant of +;;; INITIAL_BLOCKS_PARTIAL is invoked +%macro GCM_ENC_DEC_SMALL 42 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%ENC_DEC %6 ; [in] cipher direction +%define %%DATA_OFFSET %7 ; [in] data offset +%define %%LENGTH %8 ; [in] data length +%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16 +%define %%CTR %10 ; [in/out] XMM counter block +%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value +%define %%INSTANCE_TYPE %12 ; [in] single or multi call +%define %%ZTMP0 %13 ; [clobbered] ZMM register +%define %%ZTMP1 %14 ; [clobbered] ZMM register +%define %%ZTMP2 %15 ; [clobbered] ZMM register +%define %%ZTMP3 %16 ; [clobbered] ZMM register +%define %%ZTMP4 %17 ; [clobbered] ZMM register +%define %%ZTMP5 %18 ; [clobbered] ZMM register +%define %%ZTMP6 %19 ; [clobbered] ZMM register +%define %%ZTMP7 %20 ; [clobbered] ZMM register +%define %%ZTMP8 %21 ; [clobbered] ZMM register +%define %%ZTMP9 %22 ; [clobbered] ZMM register +%define %%ZTMP10 %23 ; [clobbered] ZMM register +%define %%ZTMP11 %24 ; [clobbered] ZMM register 
+%define %%ZTMP12 %25 ; [clobbered] ZMM register +%define %%ZTMP13 %26 ; [clobbered] ZMM register +%define %%ZTMP14 %27 ; [clobbered] ZMM register +%define %%ZTMP15 %28 ; [clobbered] ZMM register +%define %%ZTMP16 %29 ; [clobbered] ZMM register +%define %%ZTMP17 %30 ; [clobbered] ZMM register +%define %%ZTMP18 %31 ; [clobbered] ZMM register +%define %%ZTMP19 %32 ; [clobbered] ZMM register +%define %%ZTMP20 %33 ; [clobbered] ZMM register +%define %%ZTMP21 %34 ; [clobbered] ZMM register +%define %%ZTMP22 %35 ; [clobbered] ZMM register +%define %%GH %36 ; [in] ZMM ghash sum (high) +%define %%GL %37 ; [in] ZMM ghash sum (low) +%define %%GM %38 ; [in] ZMM ghash sum (middle) +%define %%IA0 %39 ; [clobbered] GP register +%define %%IA1 %40 ; [clobbered] GP register +%define %%MASKREG %41 ; [clobbered] mask register +%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask + + cmp %%NUM_BLOCKS, 8 + je %%_small_initial_num_blocks_is_8 + jl %%_small_initial_num_blocks_is_7_1 + + + cmp %%NUM_BLOCKS, 12 + je %%_small_initial_num_blocks_is_12 + jl %%_small_initial_num_blocks_is_11_9 + + ;; 16, 15, 14 or 13 + cmp %%NUM_BLOCKS, 16 + je %%_small_initial_num_blocks_is_16 + cmp %%NUM_BLOCKS, 15 + je %%_small_initial_num_blocks_is_15 + cmp %%NUM_BLOCKS, 14 + je %%_small_initial_num_blocks_is_14 + jmp %%_small_initial_num_blocks_is_13 + +%%_small_initial_num_blocks_is_11_9: + ;; 11, 10 or 9 + cmp %%NUM_BLOCKS, 11 + je %%_small_initial_num_blocks_is_11 + cmp %%NUM_BLOCKS, 10 + je %%_small_initial_num_blocks_is_10 + jmp %%_small_initial_num_blocks_is_9 + +%%_small_initial_num_blocks_is_7_1: + cmp %%NUM_BLOCKS, 4 + je %%_small_initial_num_blocks_is_4 + jl %%_small_initial_num_blocks_is_3_1 + ;; 7, 6 or 5 + cmp %%NUM_BLOCKS, 7 + je %%_small_initial_num_blocks_is_7 + cmp %%NUM_BLOCKS, 6 + je %%_small_initial_num_blocks_is_6 + jmp %%_small_initial_num_blocks_is_5 + +%%_small_initial_num_blocks_is_3_1: + ;; 3, 2 or 1 + cmp %%NUM_BLOCKS, 3 + je %%_small_initial_num_blocks_is_3 + cmp %%NUM_BLOCKS, 2 + je %%_small_initial_num_blocks_is_2 + + ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed + + ;; Use rep to generate different block size variants + ;; - one block size has to be the first one +%assign num_blocks 1 +%rep 16 +%%_small_initial_num_blocks_is_ %+ num_blocks : + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \ + %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFMASK +%if num_blocks != 16 + jmp %%_small_initial_blocks_encrypted +%endif +%assign num_blocks (num_blocks + 1) +%endrep + +%%_small_initial_blocks_encrypted: + +%endmacro ; GCM_ENC_DEC_SMALL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). 
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and zmm0-zmm31, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 7 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%ENC_DEC %6 ; [in] cipher direction +%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection + +%define %%IA0 r10 +%define %%IA1 r12 +%define %%IA2 r13 +%define %%IA3 r15 +%define %%IA4 r11 +%define %%IA5 rax + +%define %%LENGTH %%IA2 +%define %%CTR_CHECK %%IA3 +%define %%DATA_OFFSET %%IA4 + +%define %%HASHK_PTR %%IA5 + +%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now + +%define %%AES_PARTIAL_BLOCK xmm8 +%define %%CTR_BLOCK2z zmm18 +%define %%CTR_BLOCKz zmm9 +%define %%CTR_BLOCKx xmm9 +%define %%AAD_HASHz zmm14 +%define %%AAD_HASHx xmm14 + +;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL +%define %%ZTMP0 zmm0 +%define %%ZTMP1 zmm3 +%define %%ZTMP2 zmm4 +%define %%ZTMP3 zmm5 +%define %%ZTMP4 zmm6 +%define %%ZTMP5 zmm7 +%define %%ZTMP6 zmm10 +%define %%ZTMP7 zmm11 +%define %%ZTMP8 zmm12 +%define %%ZTMP9 zmm13 +%define %%ZTMP10 zmm15 +%define %%ZTMP11 zmm16 +%define %%ZTMP12 zmm17 + +;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL +;;; - some used by8 code as well through TMPxy names +%define %%ZTMP13 zmm19 +%define %%ZTMP14 zmm20 +%define %%ZTMP15 zmm21 +%define %%ZTMP16 zmm30 ; can be used in very/big_loop part +%define %%ZTMP17 zmm31 ; can be used in very/big_loop part +%define %%ZTMP18 zmm1 +%define %%ZTMP19 zmm2 +%define %%ZTMP20 zmm8 +%define %%ZTMP21 zmm22 +%define %%ZTMP22 zmm23 + +;;; Free to use: zmm24 - zmm29 +;;; - used by by128/48 and by8 +%define %%GH zmm24 +%define %%GL zmm25 +%define %%GM zmm26 +%define %%SHUF_MASK zmm29 +%define %%CTR_BLOCK_SAVE zmm28 + +;;; - used by by128/48 code only +%define %%ADDBE_4x4 zmm27 +%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE + +;; used by8 code only +%define %%GH4KEY %%ZTMP17 +%define %%GH8KEY %%ZTMP16 +%define %%BLK0 %%ZTMP18 +%define %%BLK1 %%ZTMP19 +%define %%ADD8BE zmm27 +%define %%ADD8LE %%ZTMP13 + +%define %%MASKREG k1 + +%ifdef GCM_BIG_DATA +;; reduction every 128 blocks, depth 32 blocks +;; @note 128 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is defined +%assign very_big_loop_nblocks 128 +%assign very_big_loop_depth 32 +%endif + +;; reduction every 48 blocks, depth 32 blocks +;; @note 48 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is not defined +%assign big_loop_nblocks 48 +%assign big_loop_depth 32 + +;;; Macro flow: +;;; - for message size bigger than very_big_loop_nblocks process data +;;; with "very_big_loop" parameters +;;; - for message size bigger than big_loop_nblocks process data +;;; with "big_loop" parameters +;;; - calculate the number of 16byte blocks in the message +;;; - process (number of 16byte blocks) mod 8 +;;; '%%_initial_num_blocks_is_# .. 
%%_initial_blocks_encrypted' +;;; - process 8 16 byte blocks at a time until all are done in %%_encrypt_by_8_new + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%PLAIN_CYPH_LEN, 0 +%else + or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN +%endif + je %%_enc_dec_done + + xor %%DATA_OFFSET, %%DATA_OFFSET + + ;; Update length of data processed +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], %%IA0 +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN +%endif + vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash] + +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: partial block processing makes only sense for multi_call here. + ;; Used for the update flow - if there was a previous partial + ;; block fill the remaining bytes here. + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \ + %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG +%endif + + ;; lift counter block from GCM_INIT to here +%ifidn %%INSTANCE_TYPE, single_call + vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK +%else + vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount] +%endif + + ;; Save the amount of data left to process in %%LENGTH + mov %%LENGTH, %%PLAIN_CYPH_LEN +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%DATA_OFFSET is zero in single_call case. + ;; Consequently PLAIN_CYPH_LEN will never be zero after + ;; %%DATA_OFFSET subtraction below. + ;; There may be no more data if it was consumed in the partial block. + sub %%LENGTH, %%DATA_OFFSET + je %%_enc_dec_done +%endif ; %%INSTANCE_TYPE, multi_call + + vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] + vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444] + +%ifdef GCM_BIG_DATA + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_message_below_very_big_nblocks + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_no_more_very_big_nblocks + +%%_encrypt_very_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jge %%_encrypt_very_big_nblocks + +%%_no_more_very_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, 
%%GM, very_big_loop_nblocks, very_big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_very_big_nblocks: +%endif ; GCM_BIG_DATA + + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_message_below_big_nblocks + + ;; overwritten above by CTR_BLOCK_SAVE + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_no_more_big_nblocks + +%%_encrypt_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jge %%_encrypt_big_nblocks + +%%_no_more_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_big_nblocks: + + ;; Less than 256 bytes will be handled by the small message code, which + ;; can process up to 16 x blocks (16 bytes each) + cmp %%LENGTH, (16 * 16) + jge %%_large_message_path + + ;; Determine how many blocks to process + ;; - process one additional block if there is a partial block + mov %%IA1, %%LENGTH + add %%IA1, 15 + shr %%IA1, 4 + ;; %%IA1 can be in the range from 0 to 16 + + GCM_ENC_DEC_SMALL \ + %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \ + %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + no_zmm, no_zmm, no_zmm, \ + %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK + + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + jmp %%_ghash_done + +%%_large_message_path: + ;; Determine how many blocks to process in INITIAL + ;; - process one additional block in INITIAL if there is a partial block + mov %%IA1, %%LENGTH + and %%IA1, 0xff + add %%IA1, 15 + shr %%IA1, 4 + ;; Don't allow 8 INITIAL blocks since this will + ;; be handled by the x8 partial loop. 
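+        ;; Illustrative example: for LENGTH = 300 bytes,
+        ;; (300 & 0xff) = 44, (44 + 15) >> 4 = 3 and 3 & 7 = 3,
+        ;; i.e. three blocks are dispatched to %%_initial_num_blocks_is_3.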
+ and %%IA1, 7 + je %%_initial_num_blocks_is_0 + cmp %%IA1, 1 + je %%_initial_num_blocks_is_1 + cmp %%IA1, 2 + je %%_initial_num_blocks_is_2 + cmp %%IA1, 3 + je %%_initial_num_blocks_is_3 + cmp %%IA1, 4 + je %%_initial_num_blocks_is_4 + cmp %%IA1, 5 + je %%_initial_num_blocks_is_5 + cmp %%IA1, 6 + je %%_initial_num_blocks_is_6 + +%assign number_of_blocks 7 +%rep 8 +%%_initial_num_blocks_is_ %+ number_of_blocks: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block +%if number_of_blocks != 0 + jmp %%_initial_blocks_encrypted +%endif +%assign number_of_blocks (number_of_blocks - 1) +%endrep + +%%_initial_blocks_encrypted: + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + ;; move cipher blocks from intial blocks to input of by8 macro + ;; and for GHASH_LAST_8/7 + ;; - ghash value already xor'ed into block 0 + vmovdqa64 %%BLK0, %%ZTMP0 + vmovdqa64 %%BLK1, %%ZTMP1 + + ;; The entire message cannot get processed in INITIAL_BLOCKS + ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks + ;; - INITIAL_BLOCKS processes up to 15 blocks + ;; - no need to check for zero length at this stage + + ;; In order to have only one reduction at the end + ;; start HASH KEY pointer needs to be determined based on length and + ;; call type. + ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and + ;; subtracted from LENGTH + lea %%IA1, [%%LENGTH + (8 * 16)] + add %%IA1, 15 + and %%IA1, 0x3f0 +%ifidn %%INSTANCE_TYPE, multi_call + ;; if partial block and multi_call then change hash key start by one + mov %%IA0, %%LENGTH + and %%IA0, 15 + add %%IA0, 15 + and %%IA0, 16 + sub %%IA1, %%IA0 +%endif + lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16] + sub %%HASHK_PTR, %%IA1 + ;; HASHK_PTR + ;; - points at the first hash key to start GHASH with + ;; - needs to be updated as the message is processed (incremented) + + ;; pre-load constants + vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888] + vmovdqa64 %%ADD8LE, [rel ddq_add_8888] + vpxorq %%GH, %%GH + vpxorq %%GL, %%GL + vpxorq %%GM, %%GM + + ;; prepare counter 8 blocks + vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0 + vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678] + vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234] + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + ;; Process 7 full blocks plus a partial block + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + +%%_encrypt_by_8_parallel: + ;; in_order vs. out_order is an optimization to increment the counter + ;; without shuffling it back into little endian. + ;; %%CTR_CHECK keeps track of when we need to increment in order so + ;; that the carry is handled correctly. 
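+        ;; A big-endian add of 8 (ddq_addbe_8888) only changes the low byte
+        ;; of the 32-bit counter and cannot propagate a carry, so it is used
+        ;; only while that byte stays below (256 - 8); otherwise the
+        ;; %%_encrypt_by_8 path shuffles to LE, adds with full carry and
+        ;; shuffles back to BE.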
+ + vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE) + +%%_encrypt_by_8_new: + and WORD(%%CTR_CHECK), 255 + add WORD(%%CTR_CHECK), 8 + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jz %%_encrypt_done + + cmp WORD(%%CTR_CHECK), (256 - 8) + jae %%_encrypt_by_8 + + vpaddd %%CTR_BLOCKz, %%ADD8BE + vpaddd %%CTR_BLOCK2z, %%ADD8BE + + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + + jmp %%_encrypt_by_8_new + +%%_encrypt_by_8: + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + vpaddd %%CTR_BLOCKz, %%ADD8LE + vpaddd %%CTR_BLOCK2z, %%ADD8LE + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + cmp %%LENGTH, 128 + jge %%_encrypt_by_8_new + +%%_encrypt_by_8_partial: + ;; Test to see if we need a by 8 with partial block. At this point + ;; bytes remaining should be either zero or between 113-127. + ;; 'in_order' shuffle needed to align key for partial block xor. + ;; 'out_order' is a little faster because it avoids extra shuffles. + ;; - counter blocks for the next 8 blocks are prepared and in BE format + ;; - we can go ahead with out_order scenario + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, (128 - 16) + sub %%LENGTH, (128 - 16) + +%ifidn %%INSTANCE_TYPE, multi_call + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK +%endif + +%%_encrypt_done: + ;; Extract the last counter block in LE format + vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3 + vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK) + + ;; GHASH last cipher text blocks in xmm1-xmm8 + ;; - if block 8th is partial in a multi-call path then skip the block +%ifidn %%INSTANCE_TYPE, multi_call + cmp qword [%%GDATA_CTX + PBlockLen], 0 + jz %%_hash_last_8 + + ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1 + vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3 + + GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM + + ;; XOR the partial word into the hash + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7) + jmp %%_ghash_done +%%_hash_last_8: +%endif + GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \ + %%GH, %%GL, %%GM +%%_ghash_done: + vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE) + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx 
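+        ;; CurCount and AadHash are the running state picked up by a
+        ;; subsequent update or by GCM_COMPLETE; the partial block state
+        ;; (PBlockLen/PBlockEncKey) was already stored on the partial
+        ;; block paths above.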
+%%_enc_dec_done: + +%endmacro ; GCM_ENC_DEC + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt/decrypt the initial 16 blocks +%macro INITIAL_BLOCKS_16 22 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) +%define %%CTR %6 ; [in] ZMM with CTR BE blocks 4x128 bits +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%T0 %10 ; [clobered] temporary ZMM register +%define %%T1 %11 ; [clobered] temporary ZMM register +%define %%T2 %12 ; [clobered] temporary ZMM register +%define %%T3 %13 ; [clobered] temporary ZMM register +%define %%T4 %14 ; [clobered] temporary ZMM register +%define %%T5 %15 ; [clobered] temporary ZMM register +%define %%T6 %16 ; [clobered] temporary ZMM register +%define %%T7 %17 ; [clobered] temporary ZMM register +%define %%T8 %18 ; [clobered] temporary ZMM register +%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask +%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks +%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset + +%define %%B00_03 %%T5 +%define %%B04_07 %%T6 +%define %%B08_11 %%T7 +%define %%B12_15 %%T8 + +%assign stack_offset (%%BLK_OFFSET) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp BYTE(%%CTR_CHECK), (256 - 16) + jae %%_next_16_overflow + vpaddd %%B00_03, %%CTR, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp %%_next_16_ok +%%_next_16_overflow: + vpshufb %%CTR, %%CTR, %%SHUF_MASK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%SHUF_MASK +%%_next_16_ok: + vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;; === load 16 blocks of data + VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)] + VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)] + VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)] + VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)] + + ;; move to AES encryption rounds +%assign i 0 + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vpxorq %%B00_03, %%B00_03, %%T4 + vpxorq %%B04_07, %%B04_07, %%T4 + vpxorq %%B08_11, %%B08_11, %%T4 + vpxorq %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) + +%rep NROUNDS + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenc %%B00_03, %%B00_03, %%T4 + vaesenc %%B04_07, %%B04_07, %%T4 + vaesenc %%B08_11, %%B08_11, %%T4 + vaesenc %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) +%endrep + + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenclast %%B00_03, %%B00_03, %%T4 + vaesenclast %%B04_07, %%B04_07, %%T4 + vaesenclast %%B08_11, %%B08_11, %%T4 + vaesenclast %%B12_15, %%B12_15, %%T4 + + ;; xor against text + vpxorq %%B00_03, %%B00_03, %%T0 + vpxorq %%B04_07, %%B04_07, %%T1 + vpxorq 
%%B08_11, %%B08_11, %%T2 + vpxorq %%B12_15, %%B12_15, %%T3 + + ;; store + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15 + +%ifidn %%ENC_DEC, DEC + ;; decryption - cipher text needs to go to GHASH phase + vpshufb %%B00_03, %%T0, %%SHUF_MASK + vpshufb %%B04_07, %%T1, %%SHUF_MASK + vpshufb %%B08_11, %%T2, %%SHUF_MASK + vpshufb %%B12_15, %%T3, %%SHUF_MASK +%else + ;; encryption + vpshufb %%B00_03, %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%B12_15, %%SHUF_MASK +%endif + +%ifnidn %%GHASH, no_ghash + ;; === xor cipher block 0 with GHASH for the next GHASH round + vpxorq %%B00_03, %%B00_03, %%GHASH +%endif + + vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03 + vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07 + vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11 + vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15 +%endmacro ;INITIAL_BLOCKS_16 + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt the initial N x 16 blocks +;;; - A x 16 blocks are encrypted/decrypted first (pipeline depth) +;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed +;;; - A + B = N +%macro INITIAL_BLOCKS_Nx16 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) +%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%T0 %8 ; [clobered] temporary ZMM register +%define %%T1 %9 ; [clobered] temporary ZMM register +%define %%T2 %10 ; [clobered] temporary ZMM register +%define %%T3 %11 ; [clobered] temporary ZMM register +%define %%T4 %12 ; [clobered] temporary ZMM register +%define %%T5 %13 ; [clobered] temporary ZMM register +%define %%T6 %14 ; [clobered] temporary ZMM register +%define %%T7 %15 ; [clobered] temporary ZMM register +%define %%T8 %16 ; [clobered] temporary ZMM register +%define %%T9 %17 ; [clobered] temporary ZMM register +%define %%T10 %18 ; [clobered] temporary ZMM register +%define %%T11 %19 ; [clobered] temporary ZMM register +%define %%T12 %20 ; [clobered] temporary ZMM register +%define %%T13 %21 ; [clobered] temporary ZMM register +%define %%T14 %22 ; [clobered] temporary ZMM register +%define %%T15 %23 ; [clobered] temporary ZMM register +%define %%T16 %24 ; [clobered] temporary ZMM register +%define %%T17 %25 ; [clobered] temporary ZMM register +%define %%T18 %26 ; [clobered] temporary ZMM register +%define %%T19 %27 ; [clobered] temporary ZMM register +%define %%T20 %28 ; [clobered] temporary ZMM register +%define %%T21 %29 ; [clobered] temporary ZMM register +%define %%T22 %30 ; [clobered] temporary ZMM register +%define %%GH %31 ; [out] ZMM ghash sum (high) +%define %%GL %32 ; [out] ZMM ghash sum (low) +%define %%GM %33 ; [out] ZMM ghash sum (middle) +%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask 
+%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16 +%define %%DEPTH_BLK %39 ; [in] pipline depth, number of blocks (mulitple of 16) + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NBLOCKS +%assign data_in_out_offset 0 + + ;; set up CTR_CHECK + vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR) + and DWORD(%%CTR_CHECK), 255 + + ;; in LE format after init, convert to BE + vshufi64x2 %%CTR, %%CTR, %%CTR, 0 + vpshufb %%CTR, %%CTR, %%SHUF_MASK + + ;; ==== AES lead in + + ;; first 16 blocks - just cipher + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; ==== GHASH + AES follows + + ;; first 16 blocks stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%if ((%%NBLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + + ;; mid 16 blocks - stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + add %%DATA_OFFSET, (%%NBLOCKS * 16) + +%endmacro ;INITIAL_BLOCKS_Nx16 + +;;; =========================================================================== +;;; =========================================================================== +;;; GHASH the last 16 blocks of cipher text (last part of by 32/64/128 code) +%macro GHASH_LAST_Nx16 23 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define %%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; 
[clobbered] temporary ZMM +%define %%T10 %12 ; [clobbered] temporary ZMM +%define %%T11 %13 ; [clobbered] temporary ZMM +%define %%T12 %14 ; [clobbered] temporary ZMM +%define %%T13 %15 ; [clobbered] temporary ZMM +%define %%T14 %16 ; [clobbered] temporary ZMM +%define %%T15 %17 ; [clobbered] temporary ZMM +%define %%T16 %18 ; [clobbered] temporary ZMM +%define %%GH %19 ; [in/cloberred] ghash sum (high) +%define %%GL %20 ; [in/cloberred] ghash sum (low) +%define %%GM %21 ; [in/cloberred] ghash sum (medium) +%define %%LOOP_BLK %22 ; [in] numerical number of blocks handled by the loop +%define %%DEPTH_BLK %23 ; [in] numerical number, pipeline depth (ghash vs aes) + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%T2H %%T9 +%define %%T2L %%T10 +%define %%T2M1 %%T11 +%define %%T2M2 %%T12 + +%define %%BLK1 %%T13 +%define %%BLK2 %%T14 + +%define %%HK1 %%T15 +%define %%HK2 %%T16 + +%assign hashk HashKey_ %+ %%DEPTH_BLK +%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16)) + + ;; load cipher blocks and ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 0-3 + vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH + vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL + vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM + vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2 + +%rep ((%%DEPTH_BLK - 8) / 8) +%assign hashk (hashk + 128) +%assign cipher_blk (cipher_blk + 128) + + ;; remaining blocks + ;; load next 8 cipher blocks and corresponding ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 0-3 + vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + ;; update sums + vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H + vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L + vpternlogq %%T0M1, %%T1M1, %%T2M1, 0x96 ; TM1 = T0M1 + T1M1 xor T2M1 + vpternlogq %%T0M2, %%T1M2, %%T2M2, 0x96 ; TM2 = T0M2 + T1M1 xor T2M2 +%endrep + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 + vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T2M1 + VHPXORI4x128 %%T0L, %%T2M2 + + ;; reduction + vmovdqa64 %%HK1, [rel POLY2] + VCLMUL_REDUCE 
%%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2 +%endmacro + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt & ghash multiples of 16 blocks + +%macro GHASH_ENCRYPT_Nx16_PARALLEL 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%GDATA_KEY %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%CTR_BE %5 ; [in/out] ZMM last counter block +%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT0 %7 ; [clobered] temporary ZMM register +%define %%ZT1 %8 ; [clobered] temporary ZMM register +%define %%ZT2 %9 ; [clobered] temporary ZMM register +%define %%ZT3 %10 ; [clobered] temporary ZMM register +%define %%ZT4 %11 ; [clobered] temporary ZMM register +%define %%ZT5 %12 ; [clobered] temporary ZMM register +%define %%ZT6 %13 ; [clobered] temporary ZMM register +%define %%ZT7 %14 ; [clobered] temporary ZMM register +%define %%ZT8 %15 ; [clobered] temporary ZMM register +%define %%ZT9 %16 ; [clobered] temporary ZMM register +%define %%ZT10 %17 ; [clobered] temporary ZMM register +%define %%ZT11 %18 ; [clobered] temporary ZMM register +%define %%ZT12 %19 ; [clobered] temporary ZMM register +%define %%ZT13 %20 ; [clobered] temporary ZMM register +%define %%ZT14 %21 ; [clobered] temporary ZMM register +%define %%ZT15 %22 ; [clobered] temporary ZMM register +%define %%ZT16 %23 ; [clobered] temporary ZMM register +%define %%ZT17 %24 ; [clobered] temporary ZMM register +%define %%ZT18 %25 ; [clobered] temporary ZMM register +%define %%ZT19 %26 ; [clobered] temporary ZMM register +%define %%ZT20 %27 ; [clobered] temporary ZMM register +%define %%ZT21 %28 ; [clobered] temporary ZMM register +%define %%ZT22 %29 ; [clobered] temporary ZMM register +%define %%GTH %30 ; [in/out] ZMM GHASH sum (high) +%define %%GTL %31 ; [in/out] ZMM GHASH sum (low) +%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium) +%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%GHASH %35 ; [clobbered] ZMM with intermidiate GHASH value +%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop +%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks +%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16)) +%assign hkey_offset HashKey_ %+ %%DEPTH_BLK +%assign data_in_out_offset 0 + + ;; mid 16 blocks +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; 16 blocks with reduction + 
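+        ;; 'final_reduction' folds the accumulated %%GTH/%%GTL/%%GTM sums
+        ;; into this pass and reduces them modulo the GHASH polynomial;
+        ;; the resulting 128-bit hash is copied out below and fed back in
+        ;; when the pipeline is restarted with 'first_time'.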
GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS + + ;; === xor cipher block 0 with GHASH (ZT4) + vmovdqa64 %%GHASH, %%ZT4 + + ;; start the pipeline again + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + first_time, %%ENC_DEC, data_in_out_offset, %%GHASH + +%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16) + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) + + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + + add %%DATA_OFFSET, (%%NUM_BLOCKS * 16) + +%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL +;;; =========================================================================== + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%INSTANCE_TYPE %6 +%define %%PLAIN_CYPH_LEN rax + + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + ;; Start AES as early as possible + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If the GCM function is called as a single function call rather + ;; than invoking the individual parts (init, update, finalize) we + ;; can remove a write to read dependency on AadHash. + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + + ;; Encrypt the final partial block. 
If we did this as a single call then + ;; the partial block was handled in the main GCM_ENC_DEC macro. + mov r12, [%%GDATA_CTX + PBlockLen] + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + +%endif + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 + vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap + + vpxor xmm9, xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + + cmp r11, 8 + je %%_T_8 + + simd_store_avx_15 r10, xmm9, r11, r12, rax + jmp %%_return_T_done +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: + +%ifdef SAFE_DATA + ;; Clear sensitive data from context structure + vpxor xmm0, xmm0 + vmovdqu [%%GDATA_CTX + AadHash], xmm0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 +%endif +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_vaes_avx512 / +; aes_gcm_precomp_192_vaes_avx512 / +; aes_gcm_precomp_256_vaes_avx512 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_precomp +%endif + + FUNC_SAVE + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + + FUNC_RESTORE +exit_precomp: + + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_init + + ;; Check 
context_data != NULL + cmp arg2, 0 + jz exit_init + + ;; Check IV != NULL + cmp arg3, 0 + jz exit_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + cmp arg4, 0 + jz exit_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + +exit_init: + + FUNC_RESTORE + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 / +; aes_gcm_enc_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_enc + +skip_in_out_check_update_enc: +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + +exit_update_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 / +; aes_gcm_dec_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_dec + +skip_in_out_check_update_dec: +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + +exit_update_dec: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 / +; aes_gcm_enc_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_enc_fin + + cmp arg4, 16 + ja exit_enc_fin +%endif + + FUNC_SAVE + 
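The init, update and finalize entry points in this file make up the streaming (multi_call) interface; the single-call enc/dec functions further below chain the same three stages internally. A hedged usage sketch in C, based only on the prototypes in the comments above: the aes_gcm.h header name, the 4 KB chunking, and the prior population of key_data (expanded AES round keys plus the hash key table) are assumptions, not something this hunk defines.

#include <stdint.h>
#include "aes_gcm.h"   /* assumed isa-l_crypto header declaring these functions */

static void encrypt_in_chunks(struct gcm_key_data *key_data,
                              struct gcm_context_data *ctx,
                              uint8_t *iv,                    /* 12-byte IV */
                              const uint8_t *aad, uint64_t aad_len,
                              const uint8_t *pt, uint8_t *ct, uint64_t len,
                              uint8_t tag[16])
{
    /* key_data is assumed to have been prepared by the library's key setup */
    aes_gcm_init_128_vaes_avx512(key_data, ctx, iv, aad, aad_len);

    /* plaintext may be fed in arbitrary pieces; the context carries the state */
    uint64_t done = 0;
    while (done < len) {
        uint64_t chunk = (len - done > 4096) ? 4096 : (len - done);
        aes_gcm_enc_128_update_vaes_avx512(key_data, ctx, ct + done,
                                           pt + done, chunk);
        done += chunk;
    }

    aes_gcm_enc_128_finalize_vaes_avx512(key_data, ctx, tag, 16);
}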
GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call + + FUNC_RESTORE + +exit_enc_fin: + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512 +; aes_gcm_dec_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_dec_fin + + cmp arg4, 16 + ja exit_dec_fin +%endif + + FUNC_SAVE + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call + + FUNC_RESTORE + +exit_dec_fin: + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_enc + + cmp arg10, 16 + ja exit_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call + +exit_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz 
exit_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_dec + + cmp arg10, 16 + ja exit_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_dec + +skip_aad_check_dec: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call + +exit_dec: + FUNC_RESTORE + ret + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_ %+ FN_NAME(avx512,_) +no_ %+ FN_NAME(avx512,_) %+ : +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h new file mode 100644 index 000000000..8287198ae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h @@ -0,0 +1,476 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef AES_GCM_VECTORS_H_ +#define AES_GCM_VECTORS_H_ + +#include + +typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size; +#define KBITS(K) (sizeof(K)) + +// struct to hold pointers to the key, plaintext and ciphertext vectors +typedef struct gcm_vector { + uint8_t* K; // AES Key + gcm_key_size Klen; // length of key in bits + uint8_t* IV; // initial value used by GCM + uint64_t IVlen; // length of IV in bytes + uint8_t* A; // additional authenticated data + uint64_t Alen; // length of AAD in bytes + uint8_t* P; // Plain text + uint64_t Plen; // length of our plaintext + //outputs of encryption + uint8_t* C; // same length as PT + uint8_t* T; // Authentication tag + uint8_t Tlen; // AT length can be 0 to 128bits +} gcm_vector; + +/////// +// 60-Byte Packet Encryption Using GCM-AES-128 +// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf +// K: AD7A2BD03EAC835A6F620FDCB506B345 +// IV: 12153524C0895E81B2C28465 +// AAD: D609B1F056637A0D46DF998D88E52E00 +// B2C2846512153524C0895E81 +// P: 08000F101112131415161718191A1B1C +// 1D1E1F202122232425262728292A2B2C +// 2D2E2F303132333435363738393A0002 +// C: 701AFA1CC039C0D765128A665DAB6924 +// 3899BF7318CCDC81C9931DA17FBE8EDD +// 7D17CB8B4C26FC81E3284F2B7FBA713D +// AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880 +// H: 73A23D80121DE2D5A850253FCF43120E +/////// +static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45}; +static uint8_t P1[] = { + 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C + , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C + , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02 +}; +static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65}; +static uint8_t A1[] = { + 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00 + , 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81 +}; +#define A1_len sizeof(A1) +static uint8_t C1[] = { + 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24 + , 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD + , 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D +}; +static uint8_t T1[] = { + 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80 +}; + + +/////// +// 54-Byte Packet Encryption Using GCM-AES-128 +// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf +// K: 071B113B0CA743FECCCF3D051F737382 +// IV: F0761E8DCD3D000176D457ED +// AAD: E20106D7CD0DF0761E8DCD3D88E54C2A +// 76D457ED +// P: 08000F101112131415161718191A1B1C +// 1D1E1F202122232425262728292A2B2C +// 2D2E2F30313233340004 +// C: 13B4C72B389DC5018E72A171DD85A5D3 +// 752274D3A019FBCAED09A425CD9B2E1C +// 9B72EEE7C9DE7D52B3F3 +// AT: D6A5284F4A6D3FE22A5D6C2B960494C3 +// H: E4E01725D724C1215C7309AD34539257 +/////// +static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82}; +static uint8_t P2[] = { + 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C + , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 
0x29, 0x2A, 0x2B, 0x2C + , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04 +}; +static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED}; +//static uint8_t IV1p[] = {0, 0, 0, 1}; +static uint8_t A2[] = { + 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A + , 0x76, 0xD4, 0x57, 0xED +}; +#define A2_len sizeof(A2) +static uint8_t C2[] = { + 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3 + , 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C + , 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3 +}; +static uint8_t T2[] = { + 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3 +}; + + +/////// +// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp +// [Keylen = 128] +// [IVlen = 96] +// [PTlen = 128] +// [AADlen = 128] +// [Taglen = 128] +// Count = 0 +// K: c939cc13397c1d37de6ae0e1cb7c423c +// IV: b3d8cc017cbb89b39e0f67e2 +// P: c3b3c41f113a31b73d9a5cd432103069 +// AAD: 24825602bd12a984e0092d3e448eda5f +// C: 93fe7d9e9bfd10348a5606e5cafa7354 +// AT: 0032a1dc85f1c9786925a2e71d8272dd +/////// +static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c}; +static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2}; +static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69}; +static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f}; +#define A3_len sizeof(A3) +static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54}; +static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd}; + +/////// +// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp +// [Keylen = 128] +// [IVlen = 96] +// [PTlen = 256] +// [AADlen = 128] +// [Taglen = 128] +// Count = 0 +// K = 298efa1ccf29cf62ae6824bfc19557fc +// IV = 6f58a93fe1d207fae4ed2f6d +// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901 +// AAD = 021fafd238463973ffe80256e5b1c6b1 +// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db +// T = 542465ef599316f73a7a560509a2d9f2 +/////// +static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc}; +static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d}; +static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01}; +static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1}; +#define A4_len sizeof(A4) +static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb}; +static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2}; + +/////// +// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp +// [Keylen = 128] +// 
[IVlen = 96] +// [PTlen = 256] +// [AADlen = 128] +// [Taglen = 128] +// Count = 0 +// K = 298efa1ccf29cf62ae6824bfc19557fc +// IV = 6f58a93fe1d207fae4ed2f6d +// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901 +// AAD = 021fafd238463973ffe80256e5b1c6b1 +// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db +// T = 542465ef599316f73a7a560509a2d9f2 +/////// +static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc}; +static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d}; +static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01}; +static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1}; +#define A5_len sizeof(A5) +static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb}; +static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2}; + + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 2 +// K: 00000000000000000000000000000000 +// P: 00000000000000000000000000000000 +// IV: 000000000000000000000000 +// C: 0388dace60b6a392f328c2b971b2fe78 +// T: ab6e47d42cec13bdf53a67b21257bddf +// H: 66e94bd4ef8a2c3b884cfa59ca342b2e +/////// +static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +static uint8_t A6[] = {0}; +#define A6_len 0 +static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78}; +static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf}; + + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 3 +// K: feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b391aafd255 +// IV: cafebabefacedbaddecaf888 +// H: b83b533708bf535d0aa6e52980d53b78 +// C: 42831ec2217774244b7221b784d0d49c +// e3aa212f2c02a4e035c17e2329aca12e +// 21d514b25466931c7d8f6a5aac84aa05 +// 1ba30b396a0aac973d58e091473f5985 +// T: 4d5c2af327cd64a62cf35abd2ba6fab4 +/////// +static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a + , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72 + , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25 + , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55}; +static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static 
uint8_t A7[] = {0}; +#define A7_len 0 +static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c + , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e + , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05 + , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85}; +static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 4 +// K: feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: cafebabefacedbaddecaf888 +// H: b83b533708bf535d0aa6e52980d53b78 +// C: 42831ec2217774244b7221b784d0d49c +// e3aa212f2c02a4e035c17e2329aca12e +// 21d514b25466931c7d8f6a5aac84aa05 +// 1ba30b396a0aac973d58e091 +// T: 5bc94fbc3221a5db94fae95ae7121a47 +/////// +static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P8[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a + , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72 + , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25 + , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39}; +static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef + , 0xab, 0xad, 0xda, 0xd2}; +#define A8_len sizeof(A8) +static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c + , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e + , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05 + , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85}; +static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 14 +// K: 00000000000000000000000000000000 +// 00000000000000000000000000000000 +// P: 00000000000000000000000000000000 +// A: +// IV: 000000000000000000000000 +// H: dc95c078a2408989ad48a21492842087 +// C: cea7403d4d606b6e074ec5d3baf39d18 +// T: d0d1c8a799996bf0265b98b5d48ab919 +/////// +static uint8_t K9[] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; +static uint8_t P9[] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, +}; +static uint8_t A9[] = {0}; +#define A9_len 0 +static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; +static uint8_t C9[] = { + 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18 +}; +static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 
0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 15 +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b391aafd255 +// A: +// IV: cafebabefacedbaddecaf888 +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 522dc1f099567d07f47f37a32a84427d +// 643a8cdcbfe5c0c97598a2bd2555d1aa +// 8cb08e48590dbb3da7b08b1056828838 +// c5f61e6393ba7a0abcc9f662898015ad +// T: b094dac5d93471bdec1a502270e3cc6c +/////// +static uint8_t K10[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P10[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; +static uint8_t A10[] = {0}; +#define A10_len 0 +static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static uint8_t C10[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; +static uint8_t T10[] = { + 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 16 +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: cafebabefacedbaddecaf888 +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 522dc1f099567d07f47f37a32a84427d +// 643a8cdcbfe5c0c97598a2bd2555d1aa +// 8cb08e48590dbb3da7b08b1056828838 +// c5f61e6393ba7a0abcc9f662 +// T: 76fc6ece0f4e1768cddf8853bb2d551b +/////// +static uint8_t K11[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P11[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 +}; +static uint8_t A11[] = { + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xab, 0xad, 0xda, 0xd2}; +#define A11_len sizeof(A11) +static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 
0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static uint8_t C11[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62 +}; +static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 17 -- Not supported IV length less than 12 bytes +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: cafebabefacedbad +// H: acbef20579b4b8ebce889bac8732dad7 +// C: c3762df1ca787d32ae47c13bf19844cb +// af1ae14d0b976afac52ff7d79bba9de0 +// feb582d33934a4f0954cc2363bc73f78 +// 62ac430e64abe499f47c9b1f +// T: 3a337dbf46a792c45e454913fe2ea8f2 +/////// +//static uint8_t K12[] = { +// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, +// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +//static uint8_t P12[] = { +// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 +//}; +//static uint8_t A12[] = { +// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +// 0xab, 0xad, 0xda, 0xd2}; +//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad}; +//static uint8_t H12[] = { +// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7}; +//static uint8_t C12[] = { +// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb, +// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0, +// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78, +// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f +//}; +//static uint8_t T12[] = { +// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 18 -- Not supported IV length greater than 12 bytes +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: 9313225df88406e555909c5aff5269aa +// 6a7a9538534f7da1e4c303d2a318a728 +// c3c0c95156809539fcf0e2429a6b5254 +// 16aedbf5a0de6a57a637b39b +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 5a8def2f0c9e53f1f75d7853659e2a20 +// eeb2b22aafde6419a058ab4f6f746bf4 +// 0fc0c3b780f244452da3ebf1c5d82cde +// 
a2418997200ef82e44ae7e3f +// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a +/////// + + +#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)} + +gcm_vector const gcm_vectors[] = { + //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen}; + // original vector does not have a valid sub hash key + vector(1), + vector(2), + vector(3), + vector(4), + vector(5), + vector(6), + vector(7), + vector(8), + vector(9), + vector(10), + vector(11), + /* vector(12), -- IV of less than 16bytes are not supported */ +}; + +#endif /* AES_GCM_VECTORS_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm new file mode 100644 index 000000000..ddae6a4e7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm @@ -0,0 +1,328 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Routine to do AES key expansion + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +%macro key_expansion_128_sse 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 +%endmacro + +%macro key_expansion_128_avx 0 + ;; Assumes the xmm3 includes all zeros at this point. 
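The gcm_vectors[] table that closes above is the input to the GCM self tests. A hedged sketch of how such a table is typically consumed: aes_gcm_pre_128() and aes_gcm_enc_128() are assumed to be the library's generic dispatched entry points from aes_gcm.h and are not defined in this patch hunk.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "aes_gcm.h"       /* assumed header for the dispatched GCM API */
#include "gcm_vectors.h"

int main(void)
{
    for (size_t i = 0; i < sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); i++) {
        const gcm_vector *v = &gcm_vectors[i];
        if (v->Klen != BITS_128)
            continue;                  /* 256-bit vectors would use the _256 calls */

        struct gcm_key_data key_data;
        struct gcm_context_data ctx;
        uint8_t *ct = malloc(v->Plen ? v->Plen : 1);
        uint8_t tag[16];

        aes_gcm_pre_128(v->K, &key_data);   /* key expansion + hash key table */
        aes_gcm_enc_128(&key_data, &ctx, ct, v->P, v->Plen,
                        v->IV, v->A, v->Alen, tag, v->Tlen);

        if (memcmp(ct, v->C, v->Plen) || memcmp(tag, v->T, v->Tlen))
            printf("vector %zu: mismatch\n", i);
        free(ct);
    }
    return 0;
}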
+ vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 +%endmacro + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + + +; void aes_keyexp_128(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; arg 3: r8: pointer to expanded key array for decrypt +; +mk_global aes_keyexp_128_sse, function +aes_keyexp_128_sse: + endbranch + movdqu xmm1, [KEY] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory + pxor xmm3, xmm3 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*1], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*9], xmm4 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*8], xmm5 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*3], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*7], xmm4 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*4], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*6], xmm5 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*5], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*5], xmm4 + + aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*4], xmm5 + + aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*7], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*3], xmm4 + + aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*2], xmm5 + + aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*9], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*1], xmm4 + + aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + movdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +mk_global aes_keyexp_128_avx, function +aes_keyexp_128_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory + vpxor xmm3, xmm3, xmm3 + + vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*1], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*9], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*2], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*8], xmm5 + + 
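Callers reach these routines through the aes_keyexp_128 prototype quoted above. A short usage sketch in C: the aes_keyexp.h header name and the presence of a dispatched aes_keyexp_128 symbol are assumptions, and the 176-byte buffers follow from AES-128 producing 11 round keys of 16 bytes each.

#include <stdint.h>
#include "aes_keyexp.h"          /* assumed isa-l_crypto header for these prototypes */

void expand_aes128_key(uint8_t key[16])
{
    uint8_t enc_keys[11 * 16];   /* AES-128: 10 rounds -> 11 round keys (176 bytes) */
    uint8_t dec_keys[11 * 16];   /* reversed, InvMixColumns'd schedule for decryption */

    /* the library's multibinary dispatch is expected to route this call to the
     * SSE or AVX implementation defined in this file */
    aes_keyexp_128(key, enc_keys, dec_keys);
}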
vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*3], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*7], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*6], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*5], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*5], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*4], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*7], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*3], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*2], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*9], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*1], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; void aes_keyexp_128_enc_sse(UINT8 *key, +; UINT8 *enc_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; +mk_global aes_keyexp_128_enc_sse, function +aes_keyexp_128_enc_sse: + endbranch + movdqu xmm1, [KEY] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + pxor xmm3, xmm3 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*1], xmm1 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*3], xmm1 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*4], xmm1 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*5], xmm1 + + aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + + aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*7], xmm1 + + aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + + aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*9], xmm1 + + aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + + ret + +mk_global aes_keyexp_128_enc_avx, function +aes_keyexp_128_enc_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 + vpxor xmm3, xmm3, xmm3 + + vaeskeygenassist xmm2, xmm1, 
0x1 ; Generating round key 1 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*1], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*2], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*3], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*5], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*7], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*9], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + + ret + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm new file mode 100644 index 000000000..7cde5fb67 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm @@ -0,0 +1,274 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + + + + +%macro key_expansion_1_192_sse 1 + ;; Assumes the xmm3 includes all zeros at this point. 
+ pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 + movdqu [EXP_ENC_KEYS+%1], xmm1 +%endmacro + +; Calculate w10 and w11 using calculated w9 and known w4-w5 +%macro key_expansion_2_192_sse 1 + movdqu xmm5, xmm4 + pslldq xmm5, 4 + shufps xmm6, xmm1, 11110000b + pxor xmm6, xmm5 + pxor xmm4, xmm6 + pshufd xmm7, xmm4, 00001110b + movdqu [EXP_ENC_KEYS+%1], xmm7 +%endmacro + +%macro key_dec_192_sse 1 + movdqu xmm0, [EXP_ENC_KEYS + 16 * %1] + aesimc xmm1, xmm0 + movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 +%endmacro + + + + + +%macro key_expansion_1_192_avx 1 + ;; Assumes the xmm3 includes all zeros at this point. + vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 + vmovdqu [EXP_ENC_KEYS+%1], xmm1 +%endmacro + +; Calculate w10 and w11 using calculated w9 and known w4-w5 +%macro key_expansion_2_192_avx 1 + vmovdqa xmm5, xmm4 + vpslldq xmm5, xmm5, 4 + vshufps xmm6, xmm6, xmm1, 11110000b + vpxor xmm6, xmm6, xmm5 + vpxor xmm4, xmm4, xmm6 + vpshufd xmm7, xmm4, 00001110b + vmovdqu [EXP_ENC_KEYS+%1], xmm7 +%endmacro + +%macro key_dec_192_avx 1 + vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1] + vaesimc xmm1, xmm0 + vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 +%endmacro + + + + +; void aes_keyexp_192(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; arg 3: r8: pointer to expanded key array for decrypt +; +mk_global aes_keyexp_192_sse, function +aes_keyexp_192_sse: + endbranch + +%ifnidn __OUTPUT_FORMAT__, elf64 + sub rsp, 16*2 + 8 + movdqu [rsp + 0*16], xmm6 + movdqu [rsp + 1*16], xmm7 +%endif + + movq xmm7, [KEY + 16] ; loading the AES key, 64 bits + movq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion + pshufd xmm4, xmm7, 01001111b + movdqu xmm1, [KEY] ; loading the AES key, 128 bits + movdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion + movdqu [EXP_DEC_KEYS + 16*0], xmm1 + movdqu [EXP_DEC_KEYS + 16*12], xmm1 + + pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion. + pxor xmm6, xmm6 ; Set xmm3 to be all zeros. Required for the key_expansion. 
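The aeskeygenassist immediates 0x1, 0x2, 0x4, ... used below are the FIPS-197 round constants, and the surrounding shuffle/xor macros implement the word recurrence w[i] = w[i-Nk] xor temp. For reference, a scalar KeyExpansion parameterized by Nk (4, 6 or 8 for AES-128/192/256), written as a plain restatement of the standard rather than of this file's register scheduling; the S-box is derived from the GF(2^8) inverse so the sketch stays self-contained.

#include <stdint.h>
#include <string.h>

/* GF(2^8) multiply modulo x^8 + x^4 + x^3 + x + 1 (0x11B) */
static uint8_t gmul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;
    while (b) {
        if (b & 1)
            p ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1B : 0x00));
        b >>= 1;
    }
    return p;
}

static uint8_t rotl8(uint8_t v, int n)
{
    return (uint8_t)((v << n) | (v >> (8 - n)));
}

/* AES S-box from the multiplicative inverse plus the FIPS-197 affine map */
static uint8_t sbox(uint8_t x)
{
    uint8_t inv = 0;
    if (x)
        for (int y = 1; y < 256; y++)
            if (gmul(x, (uint8_t)y) == 1) {
                inv = (uint8_t)y;
                break;
            }
    return (uint8_t)(inv ^ rotl8(inv, 1) ^ rotl8(inv, 2) ^
                     rotl8(inv, 3) ^ rotl8(inv, 4) ^ 0x63);
}

/* FIPS-197 KeyExpansion: key is 4*nk bytes (nk = 4, 6 or 8),
 * out receives 16*(nk+7) bytes of round keys. */
static void key_expansion(const uint8_t *key, int nk, uint8_t *out)
{
    int total_words = 4 * (nk + 7);        /* Nb*(Nr+1), with Nr = nk + 6 */
    uint8_t rcon = 0x01;

    memcpy(out, key, (size_t)(4 * nk));
    for (int i = nk; i < total_words; i++) {
        uint8_t t[4];
        memcpy(t, out + 4 * (i - 1), 4);
        if (i % nk == 0) {
            uint8_t c = t[0];                      /* RotWord */
            t[0] = t[1]; t[1] = t[2]; t[2] = t[3]; t[3] = c;
            for (int j = 0; j < 4; j++)            /* SubWord */
                t[j] = sbox(t[j]);
            t[0] ^= rcon;                          /* Rcon */
            rcon = gmul(rcon, 0x02);
        } else if (nk > 6 && i % nk == 4) {
            for (int j = 0; j < 4; j++)            /* AES-256 extra SubWord */
                t[j] = sbox(t[j]);
        }
        for (int j = 0; j < 4; j++)
            out[4 * i + j] = out[4 * (i - nk) + j] ^ t[j];
    }
}

With nk = 6 this yields the same 13 round keys that the 0x1 through 0x80 aeskeygenassist sequence below produces.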
+ + aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 + key_expansion_1_192_sse 24 + key_expansion_2_192_sse 40 + + aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 + key_expansion_1_192_sse 48 + key_expansion_2_192_sse 64 + + aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 + key_expansion_1_192_sse 72 + key_expansion_2_192_sse 88 + + aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 + key_expansion_1_192_sse 96 + key_expansion_2_192_sse 112 + + aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 + key_expansion_1_192_sse 120 + key_expansion_2_192_sse 136 + + aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 + key_expansion_1_192_sse 144 + key_expansion_2_192_sse 160 + + aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 + key_expansion_1_192_sse 168 + key_expansion_2_192_sse 184 + + aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 + key_expansion_1_192_sse 192 + +;;; we have already saved the 12 th key, which is pure input on the +;;; ENC key path + movdqu xmm0, [EXP_ENC_KEYS + 16 * 12] + movdqu [EXP_DEC_KEYS + 16*0], xmm0 +;;; generate remaining decrypt keys + key_dec_192_sse 1 + key_dec_192_sse 2 + key_dec_192_sse 3 + key_dec_192_sse 4 + key_dec_192_sse 5 + key_dec_192_sse 6 + key_dec_192_sse 7 + key_dec_192_sse 8 + key_dec_192_sse 9 + key_dec_192_sse 10 + key_dec_192_sse 11 + +%ifnidn __OUTPUT_FORMAT__, elf64 + movdqu xmm6, [rsp + 0*16] + movdqu xmm7, [rsp + 1*16] + add rsp, 16*2 + 8 +%endif + + ret + + + +mk_global aes_keyexp_192_avx, function +aes_keyexp_192_avx: + endbranch + +%ifnidn __OUTPUT_FORMAT__, elf64 + sub rsp, 16*2 + 8 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 +%endif + + vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits + vmovq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion + vpshufd xmm4, xmm7, 01001111b + vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits + vmovdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*12], xmm1 + + vpxor xmm3, xmm3, xmm3 + vpxor xmm6, xmm6, xmm6 + + vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 + key_expansion_1_192_avx 24 + key_expansion_2_192_avx 40 + + vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 + key_expansion_1_192_avx 48 + key_expansion_2_192_avx 64 + + vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 + key_expansion_1_192_avx 72 + key_expansion_2_192_avx 88 + + vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 + key_expansion_1_192_avx 96 + key_expansion_2_192_avx 112 + + vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 + key_expansion_1_192_avx 120 + key_expansion_2_192_avx 136 + + vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 + key_expansion_1_192_avx 144 + key_expansion_2_192_avx 160 + + vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 + key_expansion_1_192_avx 168 + key_expansion_2_192_avx 184 + + vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 + key_expansion_1_192_avx 192 + +;;; we have already saved the 12 th key, which is pure input on the +;;; ENC key path + vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12] + vmovdqu [EXP_DEC_KEYS + 16*0], xmm0 
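The aesimc-based stores in these routines and the key_dec_192_* calls that follow build the decryption schedule for the equivalent inverse cipher: the encryption round keys are taken in reverse order and every middle key is passed through InvMixColumns. A reference sketch in C (illustrative names; gmul is the usual GF(2^8) multiply):

#include <stdint.h>
#include <string.h>

/* GF(2^8) multiply modulo 0x11B, used by InvMixColumns */
static uint8_t gmul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;
    while (b) {
        if (b & 1)
            p ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1B : 0x00));
        b >>= 1;
    }
    return p;
}

/* InvMixColumns on one 16-byte round key (the transform aesimc performs) */
static void inv_mix_columns(const uint8_t in[16], uint8_t out[16])
{
    for (int c = 0; c < 4; c++) {
        const uint8_t *s = in + 4 * c;
        out[4 * c + 0] = gmul(s[0], 0x0E) ^ gmul(s[1], 0x0B) ^ gmul(s[2], 0x0D) ^ gmul(s[3], 0x09);
        out[4 * c + 1] = gmul(s[0], 0x09) ^ gmul(s[1], 0x0E) ^ gmul(s[2], 0x0B) ^ gmul(s[3], 0x0D);
        out[4 * c + 2] = gmul(s[0], 0x0D) ^ gmul(s[1], 0x09) ^ gmul(s[2], 0x0E) ^ gmul(s[3], 0x0B);
        out[4 * c + 3] = gmul(s[0], 0x0B) ^ gmul(s[1], 0x0D) ^ gmul(s[2], 0x09) ^ gmul(s[3], 0x0E);
    }
}

/* Build the decryption schedule from nr+1 encryption round keys:
 * reverse the order and apply InvMixColumns to rounds 1 .. nr-1. */
static void make_dec_keys(const uint8_t *enc, uint8_t *dec, int nr)
{
    memcpy(dec, enc + 16 * nr, 16);            /* dec round 0 = last enc key   */
    for (int r = 1; r < nr; r++)
        inv_mix_columns(enc + 16 * (nr - r), dec + 16 * r);
    memcpy(dec + 16 * nr, enc, 16);            /* dec last round = raw key     */
}

dec[0] and dec[nr] are copied untouched, matching the direct stores of the first and last encryption keys in the surrounding code.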
+;;; generate remaining decrypt keys + key_dec_192_avx 1 + key_dec_192_avx 2 + key_dec_192_avx 3 + key_dec_192_avx 4 + key_dec_192_avx 5 + key_dec_192_avx 6 + key_dec_192_avx 7 + key_dec_192_avx 8 + key_dec_192_avx 9 + key_dec_192_avx 10 + key_dec_192_avx 11 + +%ifnidn __OUTPUT_FORMAT__, elf64 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + add rsp, 16*2 + 8 +%endif + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm new file mode 100644 index 000000000..9b3eb7688 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm @@ -0,0 +1,286 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +; Routine to do AES key expansion + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; Uses the f() function of the aeskeygenassist result +%macro key_expansion_256_sse 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 +%endmacro + +; Uses the SubWord function of the aeskeygenassist result +%macro key_expansion_256_sse_2 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 10101010b + shufps xmm3, xmm4, 00010000b + pxor xmm4, xmm3 + shufps xmm3, xmm4, 10001100b + pxor xmm4, xmm3 + pxor xmm4, xmm2 +%endmacro + +; Uses the f() function of the aeskeygenassist result +%macro key_expansion_256_avx 0 + ;; Assumes the xmm3 includes all zeros at this point. + vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 +%endmacro + +; Uses the SubWord function of the aeskeygenassist result +%macro key_expansion_256_avx_2 0 + ;; Assumes the xmm3 includes all zeros at this point. 
+ vpshufd xmm2, xmm2, 10101010b + vshufps xmm3, xmm3, xmm4, 00010000b + vpxor xmm4, xmm4, xmm3 + vshufps xmm3, xmm3, xmm4, 10001100b + vpxor xmm4, xmm4, xmm3 + vpxor xmm4, xmm4, xmm2 +%endmacro + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + +; void aes_keyexp_256(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; arg 3: r8: pointer to expanded key array for decrypt +; +mk_global aes_keyexp_256_sse, function +aes_keyexp_256_sse: + endbranch + movdqu xmm1, [KEY] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory + + movdqu xmm4, [KEY+16] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*1], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory + + pxor xmm3, xmm3 ; Required for the key_expansion. + + aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*12], xmm5 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*3], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*11], xmm0 + + aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*4], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*10], xmm5 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*5], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*9], xmm0 + + aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*8], xmm5 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*7], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*7], xmm0 + + aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*6], xmm5 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*9], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*5], xmm0 + + aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*4], xmm5 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*11], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*3], xmm0 + + aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*12], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*2], xmm5 + + aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*13], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*1], xmm0 + + aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*14], xmm1 + movdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + + +mk_global aes_keyexp_256_avx, function +aes_keyexp_256_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], 
xmm1 + vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory + + vmovdqu xmm4, [KEY+16] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*1], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory + + vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion. + + vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*2], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*12], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*3], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*11], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*10], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*5], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*9], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*8], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*7], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*7], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*6], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*9], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*5], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*4], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*11], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*3], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*12], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*2], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*13], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*1], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*14], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm new file mode 100644 index 000000000..045649a64 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm @@ -0,0 +1,68 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_keyexp_128_sse +extern aes_keyexp_128_avx +extern aes_keyexp_128_enc_sse +extern aes_keyexp_128_enc_avx + +extern aes_keyexp_192_sse +extern aes_keyexp_192_avx + +extern aes_keyexp_256_sse +extern aes_keyexp_256_avx + +%include "multibinary.asm" + + +;;;; +; instantiate aes_keyexp_128 interfaces +;;;; +mbin_interface aes_keyexp_128 +mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx + +mbin_interface aes_keyexp_128_enc +mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx + +mbin_interface aes_keyexp_192 +mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx + +mbin_interface aes_keyexp_256 +mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx + +section .text +;;; func core, ver, snum +slversion aes_keyexp_128, 00, 01, 02a1 +slversion aes_keyexp_192, 00, 01, 02a2 +slversion aes_keyexp_256, 00, 01, 02a3 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h new file mode 100644 index 000000000..80c6e1e87 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h @@ -0,0 +1,302 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
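
The aes_keyexp_256_sse/_avx routines added above do two jobs in one pass: they expand the 15 round keys of the encryption schedule (aeskeygenassist plus the shufps/pxor folding in the key_expansion_256 macros), and as each round key is produced they store its aesimc image at the mirrored offset of the decryption schedule, copying round keys 0 and 14 through untouched. A minimal C sketch of that decrypt-schedule layout, assuming AES-NI intrinsics and an already expanded encryption schedule (the function name below is illustrative, not an ISA-L symbol):

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include <wmmintrin.h>          /* AES-NI intrinsics; build with -maes */

/* Given the 15 round keys of an AES-256 encryption schedule, build the
 * decryption schedule the same way the asm above does while expanding:
 * reverse the round-key order and run AESIMC (InvMixColumns) on every
 * key except the first and the last. */
static void aes256_dec_schedule_from_enc(const uint8_t enc[16 * 15],
                                         uint8_t dec[16 * 15])
{
        memcpy(&dec[16 * 14], &enc[16 * 0], 16);    /* round 0 key, unchanged  */
        memcpy(&dec[16 * 0], &enc[16 * 14], 16);    /* round 14 key, unchanged */

        for (int r = 1; r <= 13; r++) {
                __m128i rk = _mm_loadu_si128((const __m128i *)&enc[16 * r]);
                rk = _mm_aesimc_si128(rk);          /* InvMixColumns of the round key */
                _mm_storeu_si128((__m128i *)&dec[16 * (14 - r)], rk);
        }
}

This is why the assembly stores key 0 at EXP_DEC_KEYS + 16*14 and the final round key at EXP_DEC_KEYS + 16*0: the decrypt schedule is laid out for the equivalent inverse cipher so decryption can walk it forward.
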
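
keyexp_multibinary.asm defines only dispatchers: mbin_interface publishes the public names (aes_keyexp_128/192/256) and mbin_dispatch_init arranges for the first call to probe the CPU and bind the symbol to the SSE or AVX body from then on. Roughly the same idea expressed in C, with __builtin_cpu_supports() standing in for the CPUID checks done in multibinary.asm and all local names being illustrative:

#include <stdint.h>

typedef void (*keyexp_fn)(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys);

/* arch-specific bodies provided by keyexp_256.asm */
void aes_keyexp_256_sse(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys);
void aes_keyexp_256_avx(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys);

static void keyexp_256_resolve(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys);

/* starts out pointing at the resolver, is rebound on first use */
static keyexp_fn keyexp_256_impl = keyexp_256_resolve;

static void keyexp_256_resolve(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys)
{
        keyexp_256_impl = __builtin_cpu_supports("avx")
            ? aes_keyexp_256_avx : aes_keyexp_256_sse;
        keyexp_256_impl(key, enc_keys, dec_keys);
}

/* what the exported aes_keyexp_256 symbol amounts to after dispatch */
void aes_keyexp_256_dispatch_demo(uint8_t *key, uint8_t *enc_keys, uint8_t *dec_keys)
{
        keyexp_256_impl(key, enc_keys, dec_keys);
}

Note that the mbin_dispatch_init lines above pass the AVX body for both the AVX and AVX2 slots, so for key expansion only two implementations are ever selected.
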
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef AES_OSSL_HELPER_H_ +#define AES_OSSL_HELPER_H_ + +#ifdef _MSC_VER +# define inline __inline +#endif + +#include + +static inline + int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate(ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, &plaintext[outlen], &tmplen)) + printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len, + outlen, tmplen); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen)) + printf("\n ERROR!! 
EVP_DecryptFinal_ex - EVP_aes_192_cbc \n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen)) + printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen, + tmplen); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext, + int len, uint8_t * plaintext) +{ + int outlen = 0, tmplen = len, ret; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - key init\n"); + if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! 
EVP_DecryptUpdate - PT->CT\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + + ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen); + if (0 < ret) { + tmplen += outlen; + } else { + //Authentication failed mismatched key, ADD or tag + tmplen = -1; + } + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext, + int len, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + //printf("ivl:%x addl:%x tagl:%x ptl:%x\n", iv_len, aad_len, tag_len, len); + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - init\n"); + if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n"); + if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext, + int len, uint8_t * plaintext) +{ + int outlen = 0, tmplen = len, ret; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - key init\n"); + if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - PT->CT\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen); + if (0 < ret) { + tmplen += outlen; + } else { + //Authentication failed mismatched key, ADD or tag + tmplen = -1; + } + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext, + int len, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! 
EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - init\n"); + if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n"); + if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +#endif /* AES_OSSL_HELPER_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c new file mode 100644 index 000000000..5dc898992 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c @@ -0,0 +1,143 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
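
The helpers in ossl_helper.h are thin EVP wrappers used by the tests to produce reference results; they print on any EVP failure, and the GCM pair also reports authentication: openssl_aes_gcm_dec() returns the recovered plaintext length on success and -1 when the tag does not verify. A small round-trip sketch under that contract (the function name, buffer sizes and the 12-byte IV are this example's choices, not requirements of the helpers):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <openssl/evp.h>        /* ossl_helper.h builds on the EVP interface */
#include "ossl_helper.h"

int gcm_roundtrip_check(void)
{
        uint8_t key[16] = { 0 }, iv[12] = { 0 }, aad[16] = { 0 }, tag[16];
        uint8_t pt[64] = "example payload", ct[64], out[64];

        openssl_aes_gcm_enc(key, iv, sizeof(iv), aad, sizeof(aad),
                            tag, sizeof(tag), pt, sizeof(pt), ct);

        /* -1 here means the key/IV/AAD/tag combination did not authenticate */
        if (openssl_aes_gcm_dec(key, iv, sizeof(iv), aad, sizeof(aad),
                                tag, sizeof(tag), ct, sizeof(ct), out) < 0) {
                printf("GCM tag mismatch\n");
                return -1;
        }
        return memcmp(pt, out, sizeof(pt)) ? -1 : 0;
}
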
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + unsigned char *ct, unsigned char *dt) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen)) + printf("\n ERROR!! \n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt, *refdt; + unsigned char keyssl[32]; /* SSL takes both keys together */ + struct perf start, stop; + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_128_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + /* Set up key for the SSL engine */ + for (i = 0; i < 16; i++) { + keyssl[i] = key1[i]; + keyssl[i + 16] = key2[i]; + } + + /* Encrypt and compare decrypted output */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt); + if (memcmp(dt, refdt, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + perf_stop(&stop); + printf("aes_xts_128_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt); + perf_stop(&stop); + printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c new file mode 100644 index 000000000..fdaa8a9bb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c @@ -0,0 +1,125 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt; + uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11]; + uint8_t expkey1_dec[16 * 11], null_key[16 * 11]; + + printf("aes_xts_128_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Decode perf test */ + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + } + + perf_stop(&stop); + + printf("aes_xts_128_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_128(key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(key2, expkey2_enc, null_key); + XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, + pt); + } + + perf_stop(&stop); + + printf("aes_xts_128_dec_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c new file mode 100644 index 000000000..69ae2e60e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c @@ -0,0 
+1,144 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! 
\n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *refct; + struct perf start, stop; + unsigned char keyssl[32]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_128_enc_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == refct) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (i = 0; i < 16; i++) { + keyssl[i] = key1[i]; + keyssl[i + 16] = key2[i]; + } + + /* Encrypt and compare output */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + if (memcmp(ct, refct, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + perf_stop(&stop); + + printf("aes_xts_128_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + perf_stop(&stop); + + printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c new file mode 100644 index 000000000..166e46652 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c @@ -0,0 +1,123 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
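
One detail worth calling out in the two ossl_perf programs above: ISA-L's XTS entry points take the tweak key (key2) and the data key (key1) as separate arguments, while EVP_aes_128_xts() expects one 32-byte key with the data key in the first half and the tweak key in the second, which is exactly what the keyssl[] packing loop builds. The same packing as a helper (a sketch; the names are not from the library):

#include <stdint.h>
#include <string.h>

/* EVP_aes_128_xts() key layout: data (block) key first, tweak key second */
static void pack_xts128_ossl_key(const uint8_t key1[16],  /* data key  */
                                 const uint8_t key2[16],  /* tweak key */
                                 uint8_t keyssl[32])
{
        memcpy(keyssl,      key1, 16);
        memcpy(keyssl + 16, key2, 16);
}

With that layout, XTS_AES_128_enc(key2, key1, tinit, len, pt, ct) and the OpenSSL wrapper fed keyssl and tinit should produce byte-identical output, which is what the memcmp() checks in these programs rely on.
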
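
All of the perf programs time the same shape of loop: perf_start(), run the primitive TEST_LOOPS times over TEST_LEN bytes, perf_stop(), then perf_print() with the total byte count. test.h is not part of this patch, so as a rough stand-in the measurement can be expressed with clock_gettime(); the real helpers may count cycles rather than wall time:

#include <stdio.h>
#include <time.h>

static double elapsed_sec(const struct timespec *t0, const struct timespec *t1)
{
        return (t1->tv_sec - t0->tv_sec) + (t1->tv_nsec - t0->tv_nsec) / 1e9;
}

/* e.g. around the ISA-L loop:
 *   struct timespec t0, t1;
 *   clock_gettime(CLOCK_MONOTONIC, &t0);
 *   for (i = 0; i < TEST_LOOPS; i++)
 *           XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
 *   clock_gettime(CLOCK_MONOTONIC, &t1);
 *   printf("%.1f MB/s\n",
 *          ((double)TEST_LEN * TEST_LOOPS / 1e6) / elapsed_sec(&t0, &t1));
 */
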
+**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct; + uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11]; + uint8_t expkey1_dec[16 * 11], null_key[16 * 11]; + + printf("aes_xts_128_enc_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Encode perf test */ + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + } + + perf_stop(&stop); + + printf("aes_xts_128_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_128(key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(key2, expkey2_enc, null_key); + XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, + ct); + } + + perf_stop(&stop); + + printf("aes_xts_128_enc_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c new file mode 100644 index 000000000..27599f0ca --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c @@ -0,0 +1,116 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
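
Both perf programs above also exercise the _expanded_key variants: each 128-bit key is expanded once with aes_keyexp_128() and the schedules are handed straight to the XTS calls, saving the per-call key expansion. Only the encryption schedule of the tweak key is ever consumed, which is why a throwaway buffer (null_key) is passed for its decryption schedule. A compact round-trip using the same prototypes (the wrapper's name is illustrative):

#include <stdint.h>
#include "aes_xts.h"
#include "aes_keyexp.h"

void xts128_expanded_roundtrip(uint8_t key1[16], uint8_t key2[16],
                               uint8_t tweak[16],
                               uint8_t *pt, uint8_t *ct, uint8_t *dt, int len)
{
        uint8_t k1_enc[16 * 11], k1_dec[16 * 11];
        uint8_t k2_enc[16 * 11], k2_dec_unused[16 * 11];

        aes_keyexp_128(key1, k1_enc, k1_dec);           /* data key: both schedules  */
        aes_keyexp_128(key2, k2_enc, k2_dec_unused);    /* tweak key: enc only used  */

        XTS_AES_128_enc_expanded_key(k2_enc, k1_enc, tweak, len, pt, ct);
        XTS_AES_128_dec_expanded_key(k2_enc, k1_dec, tweak, len, ct, dt);
}
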
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "xts_128_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + // Arrays for expanded keys, null_key is a dummy vector (decrypt key not + // needed for the tweak part of the decryption) + uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11]; + uint8_t expkey1_dec[16 * 11], null_key[16 * 11]; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + printf("Can't allocate ciphertext memory\n"); + return -1; + } + // Pre-expand keys (will only use the encryption ones here) + aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key); + + XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + // Vectors 1-10 and 15-19 are for the 128 bit code + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + printf("failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + printf("Can't allocate plaintext memory\n"); + return -1; + } + // Pre-expand keys for the decryption + aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key); + + // Note, encryption key is re-used for the tweak decryption step + XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + printf(" failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c new file mode 100644 index 000000000..4753d6778 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c @@ -0,0 +1,247 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
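
The comments in the test above note that the decryption key is not needed for the tweak: XTS always derives the tweak with the forward cipher under key2, then applies an encrypt-xor (or decrypt-xor) per 16-byte block under key1, multiplying the tweak by alpha in GF(2^128) between blocks. A sketch of the whole-block decrypt path, with aes128_encrypt_block()/aes128_decrypt_block() as hypothetical placeholders for the expanded-key AES primitives and ciphertext stealing for a partial last block left out:

#include <stdint.h>
#include <string.h>

void aes128_encrypt_block(const uint8_t *expkey, uint8_t block[16]);  /* placeholder */
void aes128_decrypt_block(const uint8_t *expkey, uint8_t block[16]);  /* placeholder */

static void gf_mul_alpha(uint8_t t[16])     /* multiply the tweak by x in GF(2^128) */
{
        uint8_t carry = 0;
        for (int i = 0; i < 16; i++) {
                uint8_t next = t[i] >> 7;
                t[i] = (uint8_t)((t[i] << 1) | carry);
                carry = next;
        }
        if (carry)
                t[0] ^= 0x87;               /* reduce by x^128 + x^7 + x^2 + x + 1 */
}

static void xts128_dec_blocks(const uint8_t *k2_enc, const uint8_t *k1_dec,
                              const uint8_t iv[16], uint8_t *buf, int nblocks)
{
        uint8_t t[16];

        memcpy(t, iv, 16);
        aes128_encrypt_block(k2_enc, t);    /* tweak always uses the forward cipher */

        for (int b = 0; b < nblocks; b++, buf += 16) {
                for (int i = 0; i < 16; i++) buf[i] ^= t[i];
                aes128_decrypt_block(k1_dec, buf);
                for (int i = 0; i < 16; i++) buf[i] ^= t[i];
                gf_mul_alpha(t);
        }
}
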
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include +#include + +#define TEST_LEN (1024*1024) +#define TEST_SIZE (4096) +#ifndef RANDOMS +# define RANDOMS 10 +#endif + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int t, n; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt; + + int align, size, min_size; + unsigned char *efence_pt; + unsigned char *efence_ct; + unsigned char *efence_dt; + + unsigned char *origin_pt; + unsigned char *origin_ct; + unsigned char *origin_dt; + + unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11]; + unsigned char key2_exp_tw[16 * 11]; + int i; + + printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + if (memcmp(pt, dt, TEST_LEN)) { + printf("fail\n"); + return -1; + } + putchar('.'); + + // Do tests with random data, keys and message size + for (t = 0; t < RANDOMS; t++) { + n = rand() % (TEST_LEN); + if (n < 17) + continue; + + mk_rand_data(key1, key2, tinit, pt, n); + XTS_AES_128_enc(key2, key1, tinit, n, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, n, ct, dt); + + if (memcmp(pt, dt, n)) { + printf("fail rand %d, size %d\n", t, n); + return -1; + } + putchar('.'); + fflush(0); + } + + // Run tests at end of buffer for Electric Fence + align = 1; + min_size = 16; + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; 
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + origin_pt = malloc(TEST_LEN); + origin_ct = malloc(TEST_LEN); + origin_dt = malloc(TEST_LEN); + if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) { + printf("malloc of testsize failed\n"); + return -1; + } + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or writing any data. + for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + for (i = 0; i < 16 * 11; i++) { + key2_exp_tw[i] = rand(); + } + + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or writing any data. 
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c new file mode 100644 index 000000000..065b84465 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c @@ -0,0 +1,271 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
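
The random test above spells out a behaviour callers should plan for: for data lengths of 0 to 15 bytes the XTS routines return without reading or writing anything and without an error code. An application that wants an explicit failure for sub-block inputs can add a thin wrapper, for example (a sketch; the -1 convention is this example's, not the library's):

#include <stdint.h>
#include "aes_xts.h"

/* XTS needs at least one full 16-byte block; the library itself simply
 * returns for shorter inputs, so surface that case as an error here. */
static int xts128_enc_checked(uint8_t *k2, uint8_t *k1, uint8_t *tweak,
                              uint64_t len, uint8_t *pt, uint8_t *ct)
{
        if (len < 16)
                return -1;
        XTS_AES_128_enc(k2, k1, tweak, len, pt, ct);
        return 0;
}
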
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int i;
+	for (i = 0; i < 16; i++) {
+		*k1++ = rand();
+		*k2++ = rand();
+		*k3++ = rand();
+	}
+	for (i = 0; i < n; i++)
+		*p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 encryption */
+static inline
+    int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+				int len, unsigned char *pt, unsigned char *ct)
+{
+	int outlen, tmplen;
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+	    || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+	    || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+		printf("\n Error in openssl encoding of %d bytes\n", len);
+		return 1;
+	}
+	return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 decryption */
+static inline
+    int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+				int len, unsigned char *ct, unsigned char *dt)
+{
+	int outlen, tmplen;
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+	    || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+	    || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+		printf("\n Error in openssl decoding of %d bytes\n", len);
+		return 1;
+	}
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt, *ct, *dt, *refct, *refdt;
+	unsigned char keyssl[32];	/* SSL takes both keys together */
+	unsigned int rand_len, t;
+	int i, j, k, ret;
+	int seed;
+
+	if (argc == 1)
+		seed = TEST_SEED;
+	else
+		seed = atoi(argv[1]);
+
+	srand(seed);
+	printf("SEED: %d\n", seed);
+
+	/* Initialise our cipher context, which can use same input vectors */
+	EVP_CIPHER_CTX *ctx;
+	ctx = EVP_CIPHER_CTX_new();
+
+	/* Allocate space for input and output buffers */
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+	refct = malloc(TEST_LEN);
+	refdt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+		printf("malloc of testsize failed\n");
+		return -1;
+	}
+
+	/**************************** LENGTH SCAN TEST *************************/
+	printf("aes_xts_128_rand_ossl test, %d sets of various lengths: ", 2 * 1024);
+
+	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+	/*
+	 * Set up key for the SSL engine: EVP_aes_128_xts() takes the data key
+	 * and the tweak key concatenated in a single 32-byte buffer.
+	 */
+	for (k = 0; k < 16; k++) {
+		keyssl[k] = key1[k];
+		keyssl[k + 16] = key2[k];
+	}
+
+	for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+		/* Encrypt using each method */
+		XTS_AES_128_enc(key2, key1, tinit, i, pt, ct);
+		ret |= openssl_aes_128_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+		// Compare
+		for (ret = 0, j = 0; j < i && ret == 0; j++) {
+			if (ct[j] != refct[j])
+				ret = 1;
+		}
+		if (ret)
+			printf(" XTS_AES_128_enc size=%d failed at byte %d!\n", i, j);
+
+		/* Decrypt using each method */
+		XTS_AES_128_dec(key2, key1, tinit, i, ct, dt);
+		ret |= openssl_aes_128_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+		for (k = 0, j = 0; j < i && ret == 0; j++) {
+			if (dt[j] != refdt[j])
+				ret = 1;
+		}
+		if (ret)
+			printf(" XTS_AES_128_dec size=%d failed at byte %d!\n", i, j);
+
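+		/*
+		 * Note on coverage: writing i = 16*q + r with 0 <= r < 16, XTS
+		 * encrypts q full blocks and, whenever r != 0, applies
+		 * ciphertext stealing to the final partial block (e.g. i = 37
+		 * is two full blocks plus a 5-byte stolen tail). Scanning every
+		 * i up to LENGTH_SCAN therefore exercises all 16 tail residues
+		 * against the OpenSSL reference.
+		 */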
if (0 == i % (LENGTH_SCAN / 16)) + printf("."); + fflush(0); + } + if (ret) + return -1; + printf("Pass\n"); + + /**************************** FIXED LENGTH TEST *************************/ + printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN); + + // Loop over the vectors + for (i = 0; i < TEST_LOOPS; i++) { + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (k = 0; k < 16; k++) { + keyssl[k] = key1[k]; + keyssl[k + 16] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < TEST_LEN; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_128_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt)) + return -1; + + for (j = 0; j < TEST_LEN; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_128_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == i % (TEST_LOOPS / 16)) + printf("."); + fflush(0); + } + printf("Pass\n"); + + /**************************** RANDOM LENGTH TEST *************************/ + printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS); + + /* Run tests with random size */ + + for (t = 0; t < RANDOMS; t++) { + + rand_len = rand() % (TEST_LEN); + rand_len = rand_len < 16 ? 16 : rand_len; + mk_rand_data(key1, key2, tinit, pt, rand_len); + + /* Set up key for the SSL engine */ + for (k = 0; k < 16; k++) { + keyssl[k] = key1[k]; + keyssl[k + 16] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct); + if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < rand_len; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_128_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt); + if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt)) + return -1; + + for (j = 0; j < rand_len; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_128_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == t % (RANDOMS / 16)) + printf("."); + fflush(0); + } + + EVP_CIPHER_CTX_free(ctx); + + printf("Pass\n"); + + printf("aes_xts_128_rand_ossl: All tests passed\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c new file mode 100644 index 000000000..5dd57e33c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c @@ -0,0 +1,106 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test;
+	uint8_t *pt_test;
+
+	int i, j;
+
+	// --- Encryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated ciphertext
+		ct_test = malloc(vlist[i].ptlen);
+		if (ct_test == NULL) {
+			fprintf(stderr, "Can't allocate ciphertext memory\n");
+			return -1;
+		}
+
+		XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+				vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+		// Carry out comparison of the calculated ciphertext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (ct_test[j] != vlist[i].CTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+				printf("\nXTS_AES_128_enc: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf("failed at byte %d! \n", j);
+				return -1;
+			}
+		}
+		printf(".");
+	}
+
+	// --- Decryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+		pt_test = malloc(vlist[i].ptlen);
+		if (pt_test == NULL) {
+			fprintf(stderr, "Can't allocate plaintext memory\n");
+			return -1;
+		}
+
+		XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+				vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf(" failed at byte %d! \n", j);
+				return -1;
+			}
+		}
+		printf(".");
+	}
+	printf("Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
new file mode 100644
index 000000000..fce792dc7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
@@ -0,0 +1,1691 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aes_xts.h" + +#define NVEC 14 + +// struct to hold pointers to the key, plaintext and ciphertext vectors +struct xts_vector { + uint64_t ptlen; // length of our plaintext + uint8_t *key1; // dimension 16 for 128 bit aes + uint8_t *key2; // dimension 16 for 128 bit aes + uint8_t *TW; // dimension 16 for both 128 and 256 bit + uint8_t *PTX; // min. dimension 16 + uint8_t *CTX; // same dimension as PTX +}; + +/* Define our test vectors statically here. 
Test vectors are from the standard: + * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented + * Storage Devices" + * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450 + * + * Vector 1 + * Key1 00000000000000000000000000000000 + * Key2 00000000000000000000000000000000 + * Data Unit Sequence number 0 + * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit + * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c + * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e + * Plaintext length (bytes): 32 + */ + +static uint8_t v1_key1[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_key2[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_TW[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_PTX[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_CTX[32] = { + 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec, + 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92, + 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85, + 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e +}; + +/* + * Vector 2 + * Key1 11111111111111111111111111111111 + * Key2 22222222222222222222222222222222 + * Data Unit Sequence number 3333333333 + * PTX 4444444444444444444444444444444444444444444444444444444444444444 + * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d + * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0 + * Plaintext length (bytes): 32 + */ + +static uint8_t v2_key1[16] = { + 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 +}; + +static uint8_t v2_key2[16] = { + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22 +}; + +static uint8_t v2_TW[16] = { + 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v2_PTX[32] = { + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44 +}; + +static uint8_t v2_CTX[32] = { + 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e, + 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b, + 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4, + 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0 +}; + +/* + * Vector 3 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 22222222222222222222222222222222 + * Data Unit Sequence number 3333333333 + * PTX 4444444444444444444444444444444444444444444444444444444444444444 + * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d + * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89 + * Plaintext length (bytes): 32 + */ + +static uint8_t v3_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v3_key2[16] = { + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22 +}; + +static uint8_t v3_TW[16] = { + 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00 +}; + +static uint8_t v3_PTX[32] = { + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44 +}; + +static uint8_t v3_CTX[32] = { + 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a, + 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2, + 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53, + 0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89 +}; + +/* + * Vector 4 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence number 0 + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c + * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 + * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce + * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265 + * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8 + * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434 + * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c + * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e + * CTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc + * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3 + * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344 + * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd + * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752 + * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e + * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d + * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568 + * Plaintext length (bytes): 512 + */ +static uint8_t v4_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v4_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v4_TW[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v4_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v4_CTX[512] = { + 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76, + 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2, + 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25, + 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c, + 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f, + 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00, + 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad, + 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12, + 0x32, 0x80, 0x63, 0xfd, 0x2a, 
0xab, 0x53, 0xe5, + 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5, + 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc, + 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce, + 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4, + 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84, + 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a, + 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65, + 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89, + 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51, + 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15, + 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8, + 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed, + 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91, + 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e, + 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34, + 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b, + 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5, + 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4, + 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c, + 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd, + 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3, + 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f, + 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e, + 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91, + 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19, + 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1, + 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc, + 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed, + 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde, + 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98, + 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3, + 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca, + 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6, + 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc, + 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44, + 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0, + 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95, + 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4, + 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd, + 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13, + 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7, + 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a, + 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52, + 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a, + 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38, + 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e, + 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e, + 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad, + 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8, + 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c, + 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d, + 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f, + 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2, + 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea, + 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68 +}; + +/* + * Vector 5 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number 01 + * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c + * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 + * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce + * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265 + * PTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8 + * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434 + * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c + * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e + * PTX 
94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc + * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3 + * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344 + * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd + * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752 + * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e + * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d + * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568 + * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5 + * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb + * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f + * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501 + * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99 + * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407 + * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3 + * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb + * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd + * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114 + * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed + * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62 + * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284 + * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32 + * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c + * CTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd + * Plaintext length (bytes): 512 + */ + +static uint8_t v5_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v5_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v5_TW[16] = { + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v5_PTX[512] = { + 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76, + 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2, + 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25, + 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c, + 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f, + 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00, + 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad, + 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12, + 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5, + 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5, + 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc, + 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce, + 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4, + 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84, + 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a, + 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65, + 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89, + 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51, + 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15, + 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8, + 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed, + 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91, + 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e, + 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34, + 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b, + 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5, + 0xda, 0x76, 0xb7, 0x0b, 
0xf1, 0x69, 0x0d, 0xb4, + 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c, + 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd, + 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3, + 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f, + 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e, + 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91, + 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19, + 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1, + 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc, + 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed, + 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde, + 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98, + 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3, + 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca, + 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6, + 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc, + 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44, + 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0, + 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95, + 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4, + 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd, + 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13, + 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7, + 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a, + 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52, + 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a, + 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38, + 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e, + 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e, + 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad, + 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8, + 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c, + 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d, + 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f, + 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2, + 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea, + 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68 +}; + +static uint8_t v5_CTX[512] = { + 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe, + 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f, + 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60, + 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5, + 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d, + 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce, + 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b, + 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb, + 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49, + 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3, + 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28, + 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f, + 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca, + 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2, + 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa, + 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01, + 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43, + 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d, + 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58, + 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99, + 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3, + 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f, + 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e, + 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07, + 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a, + 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52, + 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c, + 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3, + 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16, + 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b, + 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6, + 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb, + 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 
0xf5, + 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0, + 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44, + 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd, + 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb, + 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1, + 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf, + 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14, + 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7, + 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d, + 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c, + 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed, + 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7, + 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97, + 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32, + 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62, + 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75, + 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce, + 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04, + 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84, + 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43, + 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95, + 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4, + 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32, + 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab, + 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2, + 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d, + 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c, + 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4, + 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d, + 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a, + 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd +}; + +/* + * Vector 6 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number 02 + * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5 + * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb + * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f + * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501 + * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99 + * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407 + * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3 + * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb + * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd + * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114 + * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed + * PTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62 + * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284 + * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32 + * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c + * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd + * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e + * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9 + * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62 + * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7 + * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3 + * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec + * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791 + * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f + * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54 + * CTX 
5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c + * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105 + * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159 + * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab + * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60 + * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42 + * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6 + * Plaintext length (bytes): 512 + */ +static uint8_t v6_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v6_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v6_TW[16] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v6_PTX[512] = { + + 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe, + 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f, + 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60, + 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5, + 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d, + 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce, + 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b, + 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb, + 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49, + 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3, + 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28, + 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f, + 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca, + 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2, + 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa, + 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01, + 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43, + 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d, + 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58, + 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99, + 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3, + 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f, + 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e, + 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07, + 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a, + 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52, + 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c, + 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3, + 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16, + 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b, + 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6, + 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb, + 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5, + 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0, + 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44, + 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd, + 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb, + 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1, + 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf, + 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14, + 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7, + 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d, + 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c, + 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed, + 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7, + 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97, + 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32, + 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62, + 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75, + 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce, + 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04, + 0xa3, 0x87, 0x69, 
0x23, 0xb4, 0xad, 0x62, 0x84, + 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43, + 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95, + 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4, + 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32, + 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab, + 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2, + 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d, + 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c, + 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4, + 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d, + 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a, + 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd +}; + +static uint8_t v6_CTX[512] = { + + 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07, + 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56, + 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16, + 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e, + 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07, + 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e, + 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6, + 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9, + 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14, + 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11, + 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e, + 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62, + 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc, + 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63, + 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 0x07, 0x06, + 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7, + 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c, + 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c, + 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1, + 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3, + 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0, + 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00, + 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2, + 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec, + 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f, + 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58, + 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28, + 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91, + 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82, + 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e, + 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71, + 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f, + 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9, + 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0, + 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17, + 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54, + 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09, + 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d, + 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc, + 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c, + 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a, + 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d, + 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce, + 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05, + 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24, + 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e, + 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89, + 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59, + 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb, + 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab, + 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0, + 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab, + 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77, + 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02, + 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 0x91, 0xec, + 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60, + 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae, + 0x05, 0x9a, 0xa7, 0x06, 0x96, 
0xe0, 0xe3, 0x05, + 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd, + 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42, + 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43, + 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01, + 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25, + 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6 +}; + +/* + * Vector 7 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number fd + * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d + * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a + * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3 + * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46 + * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026 + * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18 + * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f + * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338 + * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641 + * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80 + * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b + * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a + * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80 + * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72 + * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292 + * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888 + * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3 + * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22 + * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce + * CTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c + * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8 + * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b + * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82 + * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969 + * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5 + * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b + * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec + * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d + * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef + * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280 + * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3 + * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637 + * Plaintext length (bytes): 512 + */ +static uint8_t v7_key1[16] = { + + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v7_key2[16] = { + + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v7_TW[16] = { + + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v7_PTX[512] = { + + 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9, + 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6, + 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40, + 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d, + 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91, + 0xfb, 0x1a, 0xcd, 
0x48, 0x57, 0xa9, 0x5b, 0x46, + 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44, + 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a, + 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07, + 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3, + 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51, + 0xd4, 0xb6, 0x0e, 0x60, 0x42, 0x05, 0x1d, 0xa3, + 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54, + 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56, + 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d, + 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46, + 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0, + 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61, + 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2, + 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26, + 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56, + 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39, + 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0, + 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18, + 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4, + 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d, + 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf, + 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f, + 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41, + 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72, + 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6, + 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38, + 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13, + 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae, + 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c, + 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41, + 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84, + 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca, + 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd, + 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80, + 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e, + 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6, + 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06, + 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b, + 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4, + 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3, + 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1, + 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a, + 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f, + 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55, + 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9, + 0x2e, 0xc0, 0xc3, 0x16, 0x9c, 0xdf, 0x2a, 0x80, + 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9, + 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae, + 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81, + 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72, + 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4, + 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8, + 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2, + 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92, + 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77, + 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61, + 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed, + 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88 +}; + +static uint8_t v7_CTX[512] = { + + 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e, + 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a, + 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9, + 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3, + 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07, + 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3, + 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08, + 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22, + 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c, + 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f, + 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd, + 0x60, 0x24, 0x32, 0xa2, 0x30, 
0x6a, 0xc4, 0xce, + 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f, + 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5, + 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc, + 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c, + 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3, + 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b, + 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec, + 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8, + 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee, + 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b, + 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3, + 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b, + 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd, + 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c, + 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2, + 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82, + 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d, + 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd, + 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6, + 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69, + 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e, + 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7, + 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53, + 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5, + 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58, + 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32, + 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a, + 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b, + 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17, + 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45, + 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54, + 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec, + 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0, + 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39, + 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66, + 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d, + 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83, + 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31, + 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03, + 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef, + 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18, + 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1, + 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65, + 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80, + 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23, + 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f, + 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49, + 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3, + 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78, + 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1, + 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc, + 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37 +}; + +/* + * Vector 8 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number fe + * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3 + * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22 + * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce + * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c + * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8 + * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b + * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82 + * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969 + * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5 + * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b + * PTX 
e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec + * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d + * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef + * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280 + * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3 + * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637 + * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395 + * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae + * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94 + * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0 + * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd + * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d + * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792 + * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e + * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18 + * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b + * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb + * CTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a + * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d + * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4 + * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1 + * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a + * Plaintext length (bytes): 512 + */ +static uint8_t v8_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v8_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v8_TW[16] = { + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v8_PTX[512] = { + 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e, + 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a, + 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9, + 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3, + 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07, + 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3, + 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08, + 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22, + 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c, + 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f, + 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd, + 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce, + 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f, + 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5, + 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc, + 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c, + 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3, + 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b, + 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec, + 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8, + 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee, + 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b, + 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3, + 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b, + 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd, + 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c, + 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2, + 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82, + 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d, + 0x23, 0xaa, 0x07, 0x18, 
0x0f, 0x76, 0xb4, 0xfd, + 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6, + 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69, + 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e, + 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7, + 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53, + 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5, + 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58, + 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32, + 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a, + 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b, + 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17, + 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45, + 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54, + 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec, + 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0, + 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39, + 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66, + 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d, + 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83, + 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31, + 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03, + 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef, + 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18, + 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1, + 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65, + 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80, + 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23, + 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f, + 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49, + 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3, + 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78, + 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1, + 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc, + 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37 +}; + +static uint8_t v8_CTX[512] = { + 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59, + 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda, + 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44, + 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95, + 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1, + 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2, + 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e, + 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae, + 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e, + 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a, + 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27, + 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94, + 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c, + 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32, + 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18, + 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0, + 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d, + 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d, + 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf, + 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd, + 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1, + 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72, + 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb, + 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d, + 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6, + 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b, + 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58, + 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92, + 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf, + 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a, + 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29, + 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e, + 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28, + 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77, + 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87, + 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 
0x18, + 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1, + 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd, + 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67, + 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b, + 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde, + 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75, + 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a, + 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb, + 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32, + 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11, + 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65, + 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a, + 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca, + 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24, + 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e, + 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d, + 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05, + 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06, + 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9, + 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4, + 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11, + 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c, + 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85, + 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1, + 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11, + 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c, + 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef, + 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a +}; + +/* + * Vector 9 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number ff + * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395 + * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae + * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94 + * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0 + * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd + * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d + * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792 + * PTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e + * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18 + * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b + * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb + * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a + * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d + * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4 + * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1 + * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a + * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553 + * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a + * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca + * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58 + * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542 + * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a + * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9 + * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46 + * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037 + * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f + * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734 + * CTX 
21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82 + * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3 + * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf + * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8 + * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd + * Plaintext length (bytes): 512 + */ +static uint8_t v9_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v9_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v9_TW[16] = { + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v9_PTX[512] = { + 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59, + 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda, + 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44, + 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95, + 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1, + 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2, + 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e, + 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae, + 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e, + 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a, + 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27, + 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94, + 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c, + 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32, + 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18, + 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0, + 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d, + 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d, + 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf, + 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd, + 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1, + 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72, + 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb, + 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d, + 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6, + 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b, + 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58, + 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92, + 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf, + 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a, + 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29, + 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e, + 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28, + 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77, + 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87, + 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18, + 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1, + 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd, + 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67, + 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b, + 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde, + 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75, + 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a, + 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb, + 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32, + 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11, + 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65, + 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a, + 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca, + 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24, + 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e, + 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d, + 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05, + 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06, + 0x19, 0xff, 
0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9, + 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4, + 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11, + 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c, + 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85, + 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1, + 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11, + 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c, + 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef, + 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a +}; + +static uint8_t v9_CTX[512] = { + 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32, + 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54, + 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa, + 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53, + 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e, + 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c, + 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59, + 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a, + 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce, + 0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea, + 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6, + 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca, + 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee, + 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b, + 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d, + 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58, + 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a, + 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d, + 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc, + 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42, + 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d, + 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e, + 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1, + 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a, + 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f, + 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9, + 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90, + 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9, + 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb, + 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2, + 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58, + 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46, + 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2, + 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0, + 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f, + 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37, + 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f, + 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea, + 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b, + 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f, + 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29, + 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c, + 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d, + 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34, + 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad, + 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b, + 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2, + 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82, + 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc, + 0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6, + 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb, + 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3, + 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17, + 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87, + 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31, + 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf, + 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3, + 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06, + 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44, + 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8, + 0x4b, 0x4d, 0x56, 0x13, 0x95, 
0xc7, 0x3c, 0x08, + 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf, + 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65, + 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd +}; + +/* + * Vector 15 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f10 + * CTX 6c1625db4671522d3d7599601de7ca09ed + * Plaintext length (bytes): 17 + */ + +static uint8_t v15_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v15_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v15_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v15_PTX[17] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10 +}; + +static uint8_t v15_CTX[17] = { + 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d, + 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09, + 0xed +}; + +/* + * Vector 16 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f1011 + * CTX d069444b7a7e0cab09e24447d24deb1fedbf + * Plaintext length (bytes): 18 + */ +static uint8_t v16_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v16_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v16_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v16_PTX[18] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11 +}; + +static uint8_t v16_CTX[18] = { + 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab, + 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f, + 0xed, 0xbf +}; + +/* + * Vector 17 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f101112 + * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d + * Plaintext length (bytes): 19 + */ + +static uint8_t v17_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v17_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v17_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v17_PTX[19] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12 +}; + +static uint8_t v17_CTX[19] = { + 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1, + 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe, + 0xed, 0xbf, 0x9d +}; + +/* + * Vector 18 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f10111213 + * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac + * Plaintext length (bytes): 20 + */ + +static uint8_t v18_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + 
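/*
 * Editor's note, not part of the upstream header: the "data unit sequence
 * number" quoted in each vector comment above is carried in the TW arrays of
 * this file as the little-endian bytes of the 128-bit XTS tweak, e.g. the
 * sequence number 9a78563412 becomes {0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, ...}.
 * A hedged sketch of how a caller might build such a tweak from a 64-bit
 * sequence number (the helper name is an assumption for illustration only):
 *
 *	static void make_tweak(uint8_t tw[16], uint64_t dusn)
 *	{
 *		int i;
 *		memset(tw, 0, 16);
 *		for (i = 0; i < 8; i++)
 *			tw[i] = (uint8_t)(dusn >> (8 * i));	// little-endian byte order
 *	}
 */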
+static uint8_t v18_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v18_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v18_PTX[20] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13 +}; + +static uint8_t v18_CTX[20] = { + 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c, + 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2, + 0xed, 0xbf, 0x9d, 0xac +}; + +/* + * Vector 19 + * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef + * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf + * Data unit sequence number 21436587a9 + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6 + * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc + * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854 + * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177 + * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3 + * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4 + * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe + * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d + * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64 + * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272 + * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648 + * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394 + * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2 + * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a + * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf + * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3 + * Plaintext length (bytes): 512 + */ +static uint8_t v19_key1[16] = { + + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef +}; + +static uint8_t v19_key2[16] = { + + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf +}; + +static uint8_t v19_TW[16] = { + + 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t 
v19_PTX[512] = { + + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v19_CTX[512] = { + 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b, + 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b, + 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a, + 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6, + 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8, + 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 
0x1f, + 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf, + 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc, + 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff, + 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4, + 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09, + 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54, + 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e, + 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc, + 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33, + 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77, + 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5, + 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e, + 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71, + 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3, + 0x99, 0x36, 0xbe, 0xea, 0x9a, 0xd9, 0x6f, 0xa1, + 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd, + 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e, + 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4, + 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04, + 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41, + 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4, + 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe, + 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06, + 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34, + 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0, + 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d, + 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f, + 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec, + 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78, + 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64, + 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26, + 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49, + 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1, + 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72, + 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41, + 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b, + 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49, + 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48, + 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda, + 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48, + 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9, + 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94, + 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11, + 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47, + 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b, + 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2, + 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34, + 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae, + 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde, + 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a, + 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b, + 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0, + 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85, + 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf, + 0x92, 0xce, 0xfc, 0x15, 0x1b, 0x5c, 0xc1, 0x61, + 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb, + 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90, + 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3 +}; + +// Define vector of structs, with pointers to the statically defined vectors + +struct xts_vector vlist[NVEC] = { + + // pointers to the statically defined vectors here + + // Vector 1 + {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX} + , + // Vector 2 + {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX} + , + // Vector 3 + {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX} + , + // Vector 4 + {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX} + , + // Vector 5 + {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX} + , + // Vector 6 + {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX} + , + 
// Vector 7 + {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX} + , + // Vector 8 + {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX} + , + // Vector 9 + {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX} + , + // Vector 15 + {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX} + , + // Vector 16 + {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX} + , + // Vector 17 + {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX} + , + // Vector 18 + {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX} + , + // Vector 19 + {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX} + +}; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c new file mode 100644 index 000000000..5bccd4a5c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include <openssl/evp.h> + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! \n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt, *refdt; + struct perf start, stop; + unsigned char keyssl[64]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_256_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + /* Set up key for the SSL engine */ + for (i = 0; i < 32; i++) { + keyssl[i] = key1[i]; + keyssl[i + 32] = key2[i]; + } + + /* Encrypt and compare decrypted output */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt); + if (memcmp(dt, refdt, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + perf_stop(&stop); + printf("aes_xts_256_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt); + perf_stop(&stop); + printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c new file mode 100644 index 000000000..ff3d62e93 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c @@ -0,0 +1,126 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt; + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + printf("aes_xts_256_dec_perf:\n"); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Decode perf test */ + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + } + + perf_stop(&stop); + + printf("aes_xts_256_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_256(key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(key2, expkey2_enc, null_key); + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, + pt); + } + + perf_stop(&stop); + + printf("aes_xts_256_dec_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff 
--git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c new file mode 100644 index 000000000..8d477ca89 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! 
\n"); + + return 0; +} + +int main(void) +{ + int i; + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *refct; + struct perf start, stop; + unsigned char keyssl[64]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_256_enc_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == refct) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (i = 0; i < 32; i++) { + keyssl[i] = key1[i]; + keyssl[i + 32] = key2[i]; + } + + /* Encrypt and compare output */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + if (memcmp(ct, refct, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + perf_stop(&stop); + + printf("aes_xts_256_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + perf_stop(&stop); + + printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c new file mode 100644 index 000000000..051dd0a0e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c @@ -0,0 +1,124 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct; + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + printf("aes_xts_256_enc_perf:\n"); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Encode perf test */ + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + } + + perf_stop(&stop); + + printf("aes_xts_256_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_256(key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(key2, expkey2_enc, null_key); + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, + ct); + } + + perf_stop(&stop); + + printf("aes_xts_256_enc_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c new file mode 100644 index 000000000..c8d664a8b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c @@ -0,0 +1,113 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "xts_256_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + // Arrays for expanded keys, null_key is a dummy vector (decrypt key not + // needed for the tweak part of the decryption) + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + printf("Can't allocate ciphertext memory\n"); + return -1; + } + // Pre-expand our keys (will only use the encryption ones here) + aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key); + + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + printf("\nXTS_AES_256_enc: Vector %d: ", i + 10); + printf("failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated plaintext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + printf("Can't allocate plaintext memory\n"); + return -1; + } + // Pre-expand keys for the decryption + aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key); + + // Note, encryption key is re-used for the tweak decryption step + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + printf("\nXTS_AES_256_dec: Vector %d: ", i + 10); + printf("failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c new file mode 100644 index 000000000..5ad7359cc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c @@ -0,0 +1,249 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include // for rand +#include // for memcmp +#include +#include + +#define TEST_LEN (1024*1024) +#define TEST_SIZE (4096) +#ifndef RANDOMS +# define RANDOMS 10 +#endif + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int t, n; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt; + + int align, size, min_size; + unsigned char *efence_pt; + unsigned char *efence_ct; + unsigned char *efence_dt; + + unsigned char *origin_pt; + unsigned char *origin_ct; + unsigned char *origin_dt; + + unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15]; + unsigned char key2_exp_tw[16 * 15]; + int i; + printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + if (memcmp(pt, dt, TEST_LEN)) { + printf("fail\n"); + return -1; + } + putchar('.'); + + // Do tests with random data, keys and message size + for (t = 0; t < RANDOMS; t++) { + n = rand() % (TEST_LEN); + if (n < 17) + continue; + + xts256_mk_rand_data(key1, key2, tinit, pt, n); + XTS_AES_256_enc(key2, key1, tinit, n, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, n, ct, dt); + + if (memcmp(pt, dt, n)) { + printf("fail rand %d, size %d\n", t, n); + return -1; + } + putchar('.'); + fflush(0); + } + + // Run tests at end of buffer for Electric Fence + align = 1; + min_size = 16; + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE 
from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + origin_pt = malloc(TEST_LEN); + origin_ct = malloc(TEST_LEN); + origin_dt = malloc(TEST_LEN); + if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) { + printf("malloc of testsize failed\n"); + return -1; + } + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or writing any data. + for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + for (i = 0; i < 16 * 15; i++) { + key2_exp_tw[i] = rand(); + } + + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or writing any data. 
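	/*
	 * Editor's sketch, not part of the upstream test: AES-XTS operates on
	 * whole 16-byte AES blocks (ciphertext stealing only covers a partial
	 * tail block after at least one full block), so requests shorter than
	 * one block cannot be processed and the library simply returns, as
	 * noted above. A caller that wants a hard error instead of the silent
	 * no-op could guard the call along these lines (the `len` variable and
	 * the -1 return convention are illustrative assumptions):
	 *
	 *	if (len < 16)
	 *		return -1;	// below the one-block minimum for XTS
	 *	XTS_AES_256_enc(key2, key1, tinit, len, pt, ct);
	 */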
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + + putchar('.'); + fflush(0); + } + + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c new file mode 100644 index 000000000..6b25277dc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c @@ -0,0 +1,273 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 encryption */
+static inline
+ int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+ printf("\n Error in openssl encoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 decryption */
+static inline
+ int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+ printf("\n Error in openssl decoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+
+ unsigned char key1[32], key2[32], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+ int i, j, k, ret;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ /* Initialise our cipher context, which can use same input vectors */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** LENGTH SCAN TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of various length: ", 2 * 1024);
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, i, pt, ct);
+ ret |= openssl_aes_256_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+ // Compare
+ for (ret = 0, j = 0; j < i && ret == 0; j++) {
+ if (ct[j] != refct[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_enc size=%d failed at byte %d!\n", i, j);
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, i, ct, dt);
+ ret |= openssl_aes_256_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+ for (k = 0, j = 0; j < TEST_LEN && ret == 0; j++) {
+ if (dt[j] != refdt[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_dec size=%d failed at byte
%d!\n", i, j); + if (0 == i % (LENGTH_SCAN / 16)) + printf("."); + fflush(0); + } + if (ret) + return -1; + printf("Pass\n"); + + /**************************** FIXED LENGTH TEST *************************/ + printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN); + + /* Loop over the vectors */ + for (i = 0; i < TEST_LOOPS; i++) { + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (k = 0; k < 32; k++) { + keyssl[k] = key1[k]; + keyssl[k + 32] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct)) + return -1; + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < TEST_LEN; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_256_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt)) + return -1; + + for (j = 0; j < TEST_LEN; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_256_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == i % (TEST_LOOPS / 16)) + printf("."); + fflush(0); + } + printf("Pass\n"); + + /**************************** RANDOM LENGTH TEST *************************/ + printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS); + + /* Run tests with random size */ + + unsigned int rand_len, t; + + for (t = 0; t < RANDOMS; t++) { + + rand_len = rand() % (TEST_LEN); + rand_len = rand_len < 16 ? 16 : rand_len; + xts256_mk_rand_data(key1, key2, tinit, pt, rand_len); + + /* Set up key for the SSL engine */ + for (k = 0; k < 32; k++) { + keyssl[k] = key1[k]; + keyssl[k + 32] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct); + if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < rand_len; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_256_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt); + if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt)) + return -1; + + for (j = 0; j < rand_len; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_256_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == t % (RANDOMS / 16)) + printf("."); + fflush(0); + } + + EVP_CIPHER_CTX_free(ctx); + + printf("Pass\n"); + + printf("aes_xts_256_rand_ossl: All tests passed\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c new file mode 100644 index 000000000..2c961f44f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c @@ -0,0 +1,105 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "xts_256_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ ct_test = NULL;
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ pt_test = NULL;
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
new file mode 100644
index 000000000..5a893f173
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
@@ -0,0 +1,1035 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aes_xts.h" + +#define NVEC 5 + +// struct to hold pointers to the key, plaintext and ciphertext vectors +struct xts_vector { + uint64_t ptlen; // length of our plaintext + uint8_t *key1; // dimension 16 for 128 bit aes + uint8_t *key2; // dimension 16 for 128 bit aes + uint8_t *TW; // dimension 16 for both 128 and 256 bit + uint8_t *PTX; // min. dimension 16 + uint8_t *CTX; // same dimension as PTX +}; + +/* Define our test vectors statically here. Test vectors are from the standard: + * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented + * Storage Devices" + * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450 + * + * Vector 10 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b + * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd + * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0 + * CTX 
c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca + * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0 + * CTX b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f + * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec + * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a + * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1 + * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae + * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29 + * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac + * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f + * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385 + * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa + * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151 + * Plaintext length (bytes): 512 + */ + +static uint8_t v10_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v10_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v10_TW[16] = { + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v10_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 
0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v10_CTX[512] = { + 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86, + 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b, + 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57, + 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b, + 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d, + 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44, + 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33, + 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd, + 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e, + 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70, + 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2, + 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0, + 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89, + 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75, + 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10, + 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca, + 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03, + 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d, + 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd, + 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0, + 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54, + 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34, + 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c, + 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f, + 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef, + 0x31, 0xa1, 0x2d, 0x3d, 0x92, 0x60, 0xf7, 0x9a, + 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50, + 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec, + 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96, + 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6, + 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb, + 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a, + 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70, + 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f, + 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60, + 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1, + 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37, + 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84, + 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51, + 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae, + 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe, + 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b, + 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f, + 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29, + 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 
0x92, 0x72, + 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52, + 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91, + 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac, + 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04, + 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0, + 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00, + 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f, + 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33, + 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93, + 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed, + 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85, + 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2, + 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a, + 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97, + 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa, + 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09, + 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54, + 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70, + 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51 +}; + +/* + * Vector 11 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11 + * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2 + * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d + * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201 + * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179 + * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a + * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6 + * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80 + * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a + * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0 + * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75 + * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63 + * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a + * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68 + * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090 + * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803 + * Plaintext length (bytes): 512 + * +*/ +static 
uint8_t v11_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v11_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v11_TW[16] = { + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v11_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 
0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v11_CTX[512] = { + 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6, + 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50, + 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02, + 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11, + 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24, + 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4, + 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2, + 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2, + 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b, + 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7, + 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac, + 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d, + 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b, + 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77, + 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7, + 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01, + 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8, + 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32, + 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42, + 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79, + 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49, + 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc, + 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41, + 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a, + 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d, + 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad, + 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60, + 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6, + 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb, + 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec, + 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce, + 0x8d, 0xdc, 0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80, + 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3, + 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9, + 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87, + 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a, + 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31, + 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39, + 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14, + 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0, + 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d, + 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1, + 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05, + 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75, + 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61, + 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66, + 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9, + 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63, + 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c, + 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08, + 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab, + 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a, + 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61, + 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15, + 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd, + 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68, + 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c, + 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24, + 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85, + 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90, + 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79, + 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94, + 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00, + 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03 +}; + +/* + * Vector 12 + * 
Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200 + * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049 + * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8 + * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1 + * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf + * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e + * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6 + * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879 + * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0 + * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964 + * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d + * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a + * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2 + * CTX 480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace + * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36 + * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v12_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v12_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v12_TW[16] = { + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v12_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v12_CTX[512] = { + 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf, + 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e, + 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac, + 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00, + 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80, + 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81, + 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b, + 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49, + 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63, + 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d, + 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f, + 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8, + 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a, + 0xfa, 0x6a, 
0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6, + 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59, + 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1, + 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a, + 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26, + 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce, + 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf, + 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79, + 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50, + 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef, + 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e, + 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce, + 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7, + 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd, + 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6, + 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f, + 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87, + 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd, + 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79, + 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3, + 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0, + 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b, + 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0, + 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6, + 0x5b, 0xa2, 0x29, 0x05, 0x03, 0x15, 0x39, 0x97, + 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5, + 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64, + 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b, + 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97, + 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f, + 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d, + 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e, + 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4, + 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69, + 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a, + 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e, + 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a, + 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37, + 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2, + 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd, + 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11, + 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb, + 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce, + 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7, + 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a, + 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95, + 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36, + 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9, + 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13, + 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88, + 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26 +}; + +/* + * Vector 13 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * 
PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5 + * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1 + * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac + * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e + * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3 + * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5 + * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9 + * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c + * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053 + * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7 + * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df + * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00 + * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460 + * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155 + * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c + * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v13_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v13_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v13_TW[16] = { + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v13_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 
0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v13_CTX[512] = { + 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22, + 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0, + 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48, + 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 0x0a, 0xe4, 0xc5, + 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44, + 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a, + 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18, + 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1, + 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9, + 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47, + 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd, + 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac, + 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5, + 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e, + 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a, + 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e, + 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16, + 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c, + 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47, + 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3, + 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68, + 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2, + 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44, + 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5, + 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1, + 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23, + 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c, + 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9, + 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d, + 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff, + 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67, + 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c, + 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8, + 
0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79, + 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b, + 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53, + 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b, + 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2, + 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7, + 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7, + 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6, + 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d, + 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e, + 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 0x00, 0xf1, 0xdf, + 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84, + 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32, + 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c, + 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00, + 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78, + 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a, + 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f, + 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60, + 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4, + 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16, + 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a, + 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55, + 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c, + 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55, + 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd, + 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c, + 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3, + 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8, + 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4, + 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20 +}; + +/* + * Vector 14 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23 + * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80 + * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604 + * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4 + * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b + * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434 + * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8 + * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff + * CTX 
3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b + * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6 + * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa + * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc + * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a + * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546 + * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de + * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v14_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v14_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v14_TW[16] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v14_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 
0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v14_CTX[512] = { + 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93, + 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d, + 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d, + 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23, + 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0, + 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a, + 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46, + 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80, + 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1, + 0x58, 0xe1, 0x26, 0x4b, 0xe2, 0x5b, 0xb0, 0x91, + 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94, + 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04, + 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf, + 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f, + 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51, + 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4, + 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28, + 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef, + 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c, + 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b, + 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79, + 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c, + 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba, + 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34, + 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11, + 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c, + 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56, + 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8, + 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5, + 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48, + 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73, + 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff, + 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd, + 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb, + 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48, + 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b, + 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2, + 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54, + 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac, + 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6, + 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26, + 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42, + 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f, + 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa, + 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50, + 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f, + 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6, + 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc, + 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00, + 0x1e, 0xc5, 0x35, 0xba, 0x18, 0xde, 0xb2, 0xdf, + 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7, + 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a, + 0x0a, 
0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22, + 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72, + 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb, + 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46, + 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf, + 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23, + 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50, + 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde, + 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5, + 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb, + 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59, + 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9 +}; + +// +// Define vector of structs, with pointers to the statically defined vectors + +struct xts_vector vlist[NVEC] = { + + // pointers to the statically defined vectors here + + // Vector 10 + {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX} + , + // Vector 11 + {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX} + , + // Vector 12 + {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX} + , + // Vector 13 + {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX} + , + // Vector 14 + {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX} + +}; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm new file mode 100644 index 000000000..416da1e7b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm @@ -0,0 +1,78 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern XTS_AES_128_enc_sse +extern XTS_AES_128_enc_avx + +extern XTS_AES_128_enc_expanded_key_sse +extern XTS_AES_128_enc_expanded_key_avx + +extern XTS_AES_128_dec_sse +extern XTS_AES_128_dec_avx + +extern XTS_AES_128_dec_expanded_key_sse +extern XTS_AES_128_dec_expanded_key_avx + +%if (AS_FEATURE_LEVEL) >= 10 +extern XTS_AES_128_enc_vaes +extern XTS_AES_128_enc_expanded_key_vaes +extern XTS_AES_128_dec_vaes +extern XTS_AES_128_dec_expanded_key_vaes +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key +;;;; +mbin_interface XTS_AES_128_enc +mbin_dispatch_init7 XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_vaes + +mbin_interface XTS_AES_128_enc_expanded_key +mbin_dispatch_init7 XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_vaes + +mbin_interface XTS_AES_128_dec +mbin_dispatch_init7 XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_vaes + +mbin_interface XTS_AES_128_dec_expanded_key +mbin_dispatch_init7 XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_vaes + + +;;; func core, ver, snum +slversion XTS_AES_128_enc, 01, 04, 0071 +slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072 +slversion XTS_AES_128_dec, 01, 04, 0073 +slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm new file mode 100644 index 000000000..33f376d5c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm @@ -0,0 +1,78 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern XTS_AES_256_enc_sse +extern XTS_AES_256_enc_avx + +extern XTS_AES_256_enc_expanded_key_sse +extern XTS_AES_256_enc_expanded_key_avx + +extern XTS_AES_256_dec_sse +extern XTS_AES_256_dec_avx + +extern XTS_AES_256_dec_expanded_key_sse +extern XTS_AES_256_dec_expanded_key_avx + +%if (AS_FEATURE_LEVEL) >= 10 +extern XTS_AES_256_enc_vaes +extern XTS_AES_256_enc_expanded_key_vaes +extern XTS_AES_256_dec_vaes +extern XTS_AES_256_dec_expanded_key_vaes +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key +;;;; +mbin_interface XTS_AES_256_enc +mbin_dispatch_init7 XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_vaes + +mbin_interface XTS_AES_256_enc_expanded_key +mbin_dispatch_init7 XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_vaes + +mbin_interface XTS_AES_256_dec +mbin_dispatch_init7 XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_vaes + +mbin_interface XTS_AES_256_dec_expanded_key +mbin_dispatch_init7 XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_vaes + + +;;; func core, ver, snum +slversion XTS_AES_256_enc, 01, 04, 0076 +slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077 +slversion XTS_AES_256_dec, 01, 04, 0078 +slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079 diff --git a/src/crypto/isa-l/isa-l_crypto/autogen.sh b/src/crypto/isa-l/isa-l_crypto/autogen.sh new file mode 100755 index 000000000..0a3189383 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/autogen.sh @@ -0,0 +1,17 @@ +#!/bin/sh -e + +autoreconf --install --symlink -f + +libdir() { + echo $(cd $1/$(gcc -print-multi-os-directory); pwd) +} + +args="--prefix=/usr --libdir=$(libdir /usr/lib)" + +echo +echo "----------------------------------------------------------------" +echo "Initialized build system. For a common configuration please run:" +echo "----------------------------------------------------------------" +echo +echo "./configure $args" +echo diff --git a/src/crypto/isa-l/isa-l_crypto/configure.ac b/src/crypto/isa-l/isa-l_crypto/configure.ac new file mode 100644 index 000000000..70f9cc88d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/configure.ac @@ -0,0 +1,349 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. 
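+# The assembler checks below pick yasm or nasm, set as_feature_level to 1, 4, +# 6 or 10 depending on the AVX512 and VBMI2 (vpcompressb) support detected, +# and define AS_FEATURE_LEVEL, HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI +# for the x86 assembly sources.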
+ +AC_PREREQ(2.69) +AC_INIT([libisal_crypto], + [2.24.0], + [sg.support.isal@intel.com], + [isa-l_crypto], + [http://01.org/storage-acceleration-library]) +AC_CONFIG_SRCDIR([]) +AC_CONFIG_AUX_DIR([build-aux]) +AM_INIT_AUTOMAKE([ + foreign + 1.11 + -Wall + -Wno-portability + silent-rules + tar-pax + no-dist-gzip + dist-xz + subdir-objects +]) +AM_PROG_AS + +AC_CANONICAL_HOST +CPU="" +AS_CASE([$host_cpu], + [x86_64], [CPU="x86_64"], + [amd64], [CPU="x86_64"], + [i?86], [CPU="x86_32"], + [aarch64], [CPU="aarch64"], + [arm64], [CPU="aarch64"], +) +AM_CONDITIONAL([CPU_X86_64], [test "$CPU" = "x86_64"]) +AM_CONDITIONAL([CPU_X86_32], [test "$CPU" = "x86_32"]) +AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"]) +AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"]) + +if test "$CPU" = "x86_64"; then + is_x86=yes +else + if test "$CPU" = "x86_32"; then + is_x86=yes + else + is_x86=no + fi +fi + +# Check for programs +AC_PROG_CC_STDC +AC_USE_SYSTEM_EXTENSIONS +AM_SILENT_RULES([yes]) +LT_INIT +AC_PREFIX_DEFAULT([/usr]) +AC_PROG_SED +AC_PROG_MKDIR_P + +# Options +AC_ARG_ENABLE([debug], + AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]), + [], [enable_debug=no]) +AS_IF([test "x$enable_debug" = "xyes"], [ + AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.]) +]) +# If this build is for x86, look for yasm and nasm +if test x"$is_x86" = x"yes"; then +AC_MSG_CHECKING([whether Intel CET is enabled]) +AC_TRY_COMPILE([],[ +#ifndef __CET__ +# error CET is not enabled +#endif], + [AC_MSG_RESULT([yes]) + intel_cet_enabled=yes], + [AC_MSG_RESULT([no]) + intel_cet_enabled=no]) + + + # Pick an assembler yasm or nasm + if test x"$AS" = x""; then + # Check for yasm and yasm features + yasm_feature_level=0 + AC_CHECK_PROG(HAVE_YASM, yasm, yes, no) + if test "$HAVE_YASM" = "yes"; then + yasm_feature_level=1 + else + AC_MSG_RESULT([no yasm]) + fi + if test x"$yasm_feature_level" = x"1"; then + AC_MSG_CHECKING([for modern yasm]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])]) + if yasm -f elf64 -p gas conftest.c ; then + AC_MSG_RESULT([yes]) + yasm_feature_level=4 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$yasm_feature_level" = x"4"; then + AC_MSG_CHECKING([for optional yasm AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb %zmm0, %zmm1, %zmm2;]])]) + if yasm -f elf64 -p gas conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + yasm_feature_level=6 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$yasm_feature_level" = x"6"; then + AC_MSG_CHECKING([for additional yasm AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])]) + sed -i -e '/vpcompressb/!d' conftest.c + if yasm -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + yasm_feature_level=10 + else + AC_MSG_RESULT([no]) + fi + fi + + AC_MSG_CHECKING([for optional yasm SHA-NI support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 %xmm0,%xmm1,%xmm2;]])]) + if yasm -f elf64 -p gas conftest.c 2> /dev/null; then + yasm_knows_shani=yes + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + # Check for nasm and nasm features + nasm_feature_level=0 + AC_CHECK_PROG(HAVE_NASM, nasm, yes, no) + if test "$HAVE_NASM" = "yes"; then + nasm_feature_level=1 + else + AC_MSG_RESULT([no nasm]) + fi + + if test x"$nasm_feature_level" = x"1"; then + AC_MSG_CHECKING([for modern nasm]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])]) + sed -i -e '/pblendvb/!d' conftest.c + if nasm -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + 
nasm_feature_level=4 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$nasm_feature_level" = x"4"; then + AC_MSG_CHECKING([for optional nasm AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])]) + sed -i -e '/vinsert/!d' conftest.c + if nasm -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + nasm_feature_level=6 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$nasm_feature_level" = x"6"; then + AC_MSG_CHECKING([for additional nasm AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0 {k1}, zmm1;]])]) + sed -i -e '/vpcompressb/!d' conftest.c + if nasm -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + nasm_feature_level=10 + else + AC_MSG_RESULT([no]) + fi + fi + + AC_MSG_CHECKING([for optional nasm SHA-NI support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])]) + sed -i -e '/sha256rnds2/!d' conftest.c + if nasm -f elf64 conftest.c 2> /dev/null; then + nasm_knows_shani=yes + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + if test $nasm_feature_level -ge $yasm_feature_level ; then + AS=nasm + as_feature_level=$nasm_feature_level + as_knows_shani=$nasm_knows_shani + else + AS=yasm + as_feature_level=$yasm_feature_level + as_knows_shani=$yasm_knows_shani + fi + + else + # Check for $AS supported features + as_feature_level=0 + AC_CHECK_PROG(HAVE_AS, $AS, yes, no) + if test "$HAVE_AS" = "yes"; then + as_feature_level=1 + else + AC_MSG_ERROR([no $AS]) + fi + + if test x"$as_feature_level" = x"1"; then + AC_MSG_CHECKING([for modern $AS]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])]) + sed -i -e '/pblendvb/!d' conftest.c + if $AS -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + as_feature_level=4 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$as_feature_level" = x"4"; then + AC_MSG_CHECKING([for optional as AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])]) + sed -i -e '/vinsert/!d' conftest.c + if $AS -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + as_feature_level=6 + else + AC_MSG_RESULT([no]) + fi + fi + if test x"$as_feature_level" = x"6"; then + AC_MSG_CHECKING([for additional as AVX512 support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])]) + sed -i -e '/vpcompressb/!d' conftest.c + if $AS -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + as_feature_level=10 + else + AC_MSG_RESULT([no]) + fi + fi + + AC_MSG_CHECKING([for optional nasm SHA-NI support]) + AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])]) + sed -i -e '/sha256rnds2/!d' conftest.c + if $AS -f elf64 conftest.c 2> /dev/null; then + AC_MSG_RESULT([yes]) + as_knows_shani=yes + else + AC_MSG_RESULT([no]) + fi + + fi + + if test $as_feature_level -lt 2 ; then + AC_MSG_ERROR([No modern nasm or yasm found as required. Nasm should be v2.11.01 or later (v2.13 for AVX512) and yasm should be 1.2.0 or later.]) + fi + + if test x"$as_knows_shani" = x"yes"; then + AC_DEFINE(HAVE_AS_KNOWS_SHANI, [1], [Assembler can do SHANI.]) + have_as_knows_shani=yes + else + AC_MSG_RESULT([Assembler does not understand SHANI opcodes. 
Consider upgrading for best performance.]) + fi + + case $host_os in + *linux*) arch=linux yasm_args="-f elf64";; + *darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";; + *netbsd*) arch=netbsd yasm_args="-f elf64";; + *mingw*) arch=mingw yasm_args="-f win64";; + *) arch=unknown yasm_args="-f elf64";; + esac + + # Fix for nasm missing windows features + if test x"$arch" = x"mingw"; then + AS=yasm + as_feature_level=$yasm_feature_level + if test $as_feature_level -lt 2 ; then + AC_MSG_ERROR([Mingw build requires Yasm 1.2.0 or later.]) + fi + fi + + AC_DEFINE_UNQUOTED(AS_FEATURE_LEVEL, [$as_feature_level], [Assembler feature level.]) + if test $as_feature_level -ge 6 ; then + AC_DEFINE(HAVE_AS_KNOWS_AVX512, [1], [Assembler can do AVX512.]) + have_as_knows_avx512=yes + else + AC_MSG_RESULT([Assembler does not understand AVX512 opcodes. Consider upgrading for best performance.]) + fi + + AM_CONDITIONAL(USE_YASM, test x"$AS" = x"yasm") + AM_CONDITIONAL(USE_NASM, test x"$AS" = x"nasm") + AM_CONDITIONAL(WITH_AVX512, test x"$have_as_knows_avx512" = x"yes") + AM_CONDITIONAL(WITH_SHANI, test x"$have_as_knows_shani" = x"yes") + AC_SUBST([yasm_args]) + AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin") + AC_MSG_RESULT([Using $AS args target "$arch" "$yasm_args"]) +else + # Disable below conditionals if not x86 + AM_CONDITIONAL(USE_YASM, test "x" = "y") + AM_CONDITIONAL(USE_NASM, test "x" = "y") + AM_CONDITIONAL(WITH_AVX512, test "x" = "y") + AM_CONDITIONAL(WITH_SHANI, test "x" = "y") + AM_CONDITIONAL(DARWIN, test "x" = "y") +fi + +AM_CONDITIONAL(INTEL_CET_ENABLED, [test x"$intel_cet_enabled" = x"yes"]) + +# Check for header files +AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_C_INLINE +AC_TYPE_SIZE_T +AC_TYPE_UINT16_T +AC_TYPE_UINT32_T +AC_TYPE_UINT64_T +AC_TYPE_UINT8_T + +# Checks for library functions. +AC_FUNC_MALLOC # Used only in tests +AC_CHECK_FUNCS([memmove memset]) + +my_CFLAGS="\ +-Wall \ +-Wchar-subscripts \ +-Wformat-security \ +-Wnested-externs \ +-Wpointer-arith \ +-Wshadow \ +-Wstrict-prototypes \ +-Wtype-limits \ +" +AC_SUBST([my_CFLAGS]) + +AC_CONFIG_FILES([\ + Makefile\ + libisal_crypto.pc +]) + +AC_OUTPUT +AC_MSG_RESULT([ + $PACKAGE $VERSION + ===== + + prefix: ${prefix} + sysconfdir: ${sysconfdir} + libdir: ${libdir} + includedir: ${includedir} + + compiler: ${CC} + cflags: ${CFLAGS} + ldflags: ${LDFLAGS} + + debug: ${enable_debug} +]) diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile new file mode 100644 index 000000000..41e9e29f2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile @@ -0,0 +1,27 @@ + +INCLUDE = /usr/include +CFLAGS = -O2 -I$(INCLUDE) +LDLIBS = -lisal_crypto -lcrypto -lpthread +test = isal_multithread_perf + +source += isal_multithread_perf.c +source += md5_thread.c \ + sha1_thread.c \ + sha256_thread.c \ + sha512_thread.c \ + aes_thread.c + +ODIR = bin +objects = $(addprefix $(ODIR)/, $(patsubst %.c, %.o, $(source))) + +$(test): $(objects) + gcc $? 
$(LDLIBS) -o $@ + +$(ODIR): ; mkdir -p $(ODIR) +$(objects): | $(ODIR) +$(ODIR)/%.o: %.c + gcc -c $(CFLAGS) $< -o $@ + +clean: + @echo Cleaning up + @rm -fr $(ODIR) $(test) diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt new file mode 100644 index 000000000..60335f76c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt @@ -0,0 +1,25 @@ +/* + * Saturation Test + * Written by Xiaodong Liu + */ + +This tool is used to judge the saturation performance of ISA-L's multi-buffer hash and other algorithms. +It can be used to give a comparision between multi-buffer hash and OpenSSL's single buffer hash. + +Compilation: +(Make sure isa-l_crypto library is already installed. Other libs requried are openssl and pthread.) +make + +Usage: ./isal_multithread_perf -n num_threads + -v verbose output + -t time to run(secs) + -n number of algorithm threads + -l len of each buffer(KB) + -a memory copy before algorithm -- 1 do(default); 0 not do + -b memory copy after algorithm -- 1 do(default); 0 not do + -m method of algorithm: md5 md5_mb sha1 sha1_mb sha256 sha256_mb + sha512 sha512_mb cbc_128_dec cbc_192_dec cbc_256_dec xts_128_enc + xts_256_enc gcm_128_enc gcm_256_enc + +Example: +./isal_multithread_perf -m md5 -n 10 diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c new file mode 100644 index 000000000..366fc9bcf --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c @@ -0,0 +1,380 @@ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "isal_multithread_perf.h" + +struct aes_context { + int const bits; + int (*const preproc)(struct aes_context * pCtx); + void (*const processor)(struct aes_context * pCtx, char *plaintext, + char *ciphertext, uint64_t len); + void (*const postproc)(struct aes_context * pCtx); +}; + +#define rounds_buf 2 /* first one is plain text, second is cipher text */ + +static uint64_t aes_thread_func(int32_t id, struct aes_context *pCtx) +{ + uint32_t i = 0, j = 0; + char *aes_buf[rounds_buf] = { NULL }; /* aes buf is used to do checksum compute */ + char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */ + uint64_t round = -1; + struct timeval start_tv, stop_tv; + long long secs = run_secs; + + printfv("Thread %i is started\n", id); + /* memory allocate */ + for (j = 0; j < rounds_buf; j++) { + carry_buf[j] = (char *)calloc((size_t)buflen, 1); + if (carry_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + aes_buf[j] = (char *)calloc((size_t)buflen, 1); + if (aes_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + /* Create the random data */ + for (i = 0; i < buflen; i += 1024) { + carry_buf[j][i] = i % 256; + aes_buf[j][i] = i % 256; + } + } + + if (pCtx->preproc(pCtx)) { + printf("preproc failed test aborted\n"); + goto out; + } + + /* Thread sync */ + pthread_mutex_lock(&count_lock); + count++; + if (count == num_threads) { + pthread_cond_broadcast(&count_cond); + } else { + pthread_cond_wait(&count_cond, &count_lock); + } + pthread_mutex_unlock(&count_lock); + + printfv("Thread %i is ready\n", id); + /* hash func starts to run */ + round = 0; + gettimeofday(&start_tv, 0); + gettimeofday(&stop_tv, 0); + while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) { + /* Pre mem-operation */ + if 
(prememcpy) + memcpy(aes_buf[0], carry_buf[0], buflen); + + /* Calculate checksum */ + pCtx->processor(pCtx, aes_buf[0], aes_buf[1], buflen); + + /* Post mem-operation */ + if (postmemcpy) + memcpy(carry_buf[1], aes_buf[1], buflen); + + round++; + + gettimeofday(&stop_tv, 0); + } + printfv("thread %2i, aes_func rounds %ld\n", id, round); + + out: + pCtx->postproc(pCtx); + + for (j = 0; j < rounds_buf; j++) { + free(carry_buf[j]); + free(aes_buf[j]); + } + + return round; +} + +/* + * facilities for AES-CBC + */ +static unsigned char const ic[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f +}; + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + unsigned int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +/* thread functions for cbc dec */ +struct cbc_context { + struct aes_context base; + uint8_t *iv; + uint8_t key[CBC_256_BITS]; + struct cbc_key_data *key_data; +}; + +static int cbc_dec_pre(struct aes_context *p) +{ + struct cbc_context *pCtx = (struct cbc_context *)p; + int ret; + + ret = posix_memalign((void **)&pCtx->iv, 16, (CBC_IV_DATA_LEN)); + ret |= posix_memalign((void **)&pCtx->key_data, 16, (sizeof(*pCtx->key_data))); + + if ((0 != ret) || (NULL == pCtx->iv) || (NULL == pCtx->key_data)) + return 1; + + mk_rand_data(pCtx->key, sizeof(pCtx->key)); + memcpy(pCtx->iv, ic, CBC_IV_DATA_LEN); + aes_cbc_precomp(pCtx->key, pCtx->base.bits, pCtx->key_data); + + return 0; +} + +static void cbc_dec_post(struct aes_context *p) +{ + struct cbc_context *pCtx = (struct cbc_context *)p; + + free(pCtx->iv); + free(pCtx->key_data); + + return; +} + +static void cbc_dec_proc(struct aes_context *p, char *plaintext, char *ciphertext, + uint64_t len) +{ + struct cbc_context *pCtx = (struct cbc_context *)p; + + if (pCtx->base.bits == 128) + aes_cbc_dec_128(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext, + len); + else if (pCtx->base.bits == 192) + aes_cbc_dec_192(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext, + len); + else if (pCtx->base.bits == 256) + aes_cbc_dec_256(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext, + len); + else { + printf("unsupported cbc encryption bits %d\n", pCtx->base.bits); + exit(1); + } + + return; +} + +void *cbc_128_dec_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct cbc_context ctx = + { {128, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +void *cbc_192_dec_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct cbc_context ctx = + { {192, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +void *cbc_256_dec_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct cbc_context ctx = + { {256, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +/* + * thread functions for xts enc + */ +struct xts_content { + struct aes_context base; + unsigned char key1[16 * 2]; + unsigned char key2[16 * 2]; + unsigned char tinit[16]; +}; + +static int xts_enc_pre(struct aes_context *p) +{ + struct xts_content *pCtx = (struct xts_content *)p; + + mk_rand_data(pCtx->key1, pCtx->base.bits / 8); + mk_rand_data(pCtx->key2, pCtx->base.bits / 8); + mk_rand_data(pCtx->tinit, sizeof(pCtx->tinit)); + + return 0; +} + +static void 
xts_enc_post(struct aes_context *p) +{ + return; +} + +static void xts_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext, + uint64_t len) +{ + struct xts_content *pCtx = (struct xts_content *)p; + + if (pCtx->base.bits == 128) + XTS_AES_128_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext, + ciphertext); + else if (pCtx->base.bits == 256) + XTS_AES_256_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext, + ciphertext); + else { + printf("unsupported xts encryption bits %d\n", pCtx->base.bits); + exit(1); + } + + return; +} + +void *xts_128_enc_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct xts_content ctx = + { {128, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +void *xts_256_enc_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct xts_content ctx = + { {256, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +/* + * thread functions for gcm enc + */ +struct gcm_context { + struct aes_context base; + uint8_t *key; + unsigned char *iv; + unsigned char *aad; + unsigned char *gcm_tag; + struct gcm_key_data gkey; + struct gcm_context_data gctx; +}; + +static int gcm_enc_pre(struct aes_context *p) +{ + uint8_t const IVend[] = GCM_IV_END_MARK; + + struct gcm_context *pCtx = (struct gcm_context *)p; + + pCtx->key = malloc(pCtx->base.bits / 8); + pCtx->iv = malloc(GCM_IV_LEN); + pCtx->gcm_tag = malloc(MAX_TAG_LEN); + pCtx->aad = malloc(AAD_LENGTH); + + mk_rand_data(pCtx->aad, AAD_LENGTH); + + mk_rand_data(pCtx->iv, GCM_IV_LEN); + memcpy(&pCtx->iv[GCM_IV_END_START], IVend, sizeof(IVend)); + + mk_rand_data(pCtx->key, pCtx->base.bits / 8); + if (pCtx->base.bits == 128) + aes_gcm_pre_128(pCtx->key, &pCtx->gkey); + else + aes_gcm_pre_256(pCtx->key, &pCtx->gkey); + + return 0; +} + +static void gcm_enc_post(struct aes_context *p) +{ + struct gcm_context *pCtx = (struct gcm_context *)p; + + free(pCtx->key); + free(pCtx->iv); + free(pCtx->gcm_tag); + free(pCtx->aad); + + return; +} + +static void gcm_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext, + uint64_t len) +{ + struct gcm_context *pCtx = (struct gcm_context *)p; + + if (pCtx->base.bits == 128) + aes_gcm_enc_128(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv, + pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN); + else if (pCtx->base.bits == 256) + aes_gcm_enc_256(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv, + pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN); + else { + printf("unsupported gcm encryption bits %d\n", pCtx->base.bits); + exit(1); + } + + return; +} + +void *gcm_128_enc_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct gcm_context ctx = + { {128, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} + +void *gcm_256_enc_func(void *arg) +{ + int32_t id = *((int *)arg); + uint64_t round = -1; + + struct gcm_context ctx = + { {256, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} }; + + round = aes_thread_func(id, &ctx.base); + + pthread_exit((void *)round); +} diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c new file mode 100644 index 
000000000..1263fea29 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c @@ -0,0 +1,206 @@ +/** + * @file isal_multithread_perf.c + * @brief It is used to verify high speed algorithm saturation issue + * @details + * usage: taskset -c isal_multithread_perf -m -n + * eg: taskset -c 0-9,20-29 ./isal_multithread_perf -m md5_mb -n 10 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "isal_multithread_perf.h" + +alg_method algs[] = { + {"md5", md5_ossl_func, MD5_MAX_LANES} + , + {"md5_mb", md5_mb_func, MD5_MAX_LANES} + , + {"sha1", sha1_ossl_func, SHA1_MAX_LANES} + , + {"sha1_mb", sha1_mb_func, SHA1_MAX_LANES} + , + {"sha256", sha256_ossl_func, SHA256_MAX_LANES} + , + {"sha256_mb", sha256_mb_func, SHA256_MAX_LANES} + , + {"sha512", sha512_ossl_func, SHA512_MAX_LANES} + , + {"sha512_mb", sha512_mb_func, SHA512_MAX_LANES} + , + {"cbc_128_dec", cbc_128_dec_func, 1} + , + {"cbc_192_dec", cbc_192_dec_func, 1} + , + {"cbc_256_dec", cbc_256_dec_func, 1} + , + {"xts_128_enc", xts_128_enc_func, 1} + , + {"xts_256_enc", xts_256_enc_func, 1} + , + {"gcm_128_enc", gcm_128_enc_func, 1} + , + {"gcm_256_enc", gcm_256_enc_func, 1} + , + + {NULL, NULL} +}; + +/* Global parameters*/ +long long run_secs = 10; +uint32_t num_threads = 2; +uint32_t buflen = 32 * 1024; +uint32_t prememcpy = 0; +uint32_t postmemcpy = 0; +char *method = "md5_mb"; + +/* Global thread sync */ +pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER; +volatile uint32_t count = 0; + +int verbose = 0; + +void usage(char *appname) +{ + int i = 0; + printf("Usage: %s -n num_threads\n", appname); + printf("\t-v verbose output\n" + "\t-t time to run(secs)\n" + "\t-n number of algorithm threads\n" + "\t-l len of each buffer(KB)\n" + "\t-a memory copy before algorithm -- 1 do(default); 0 not do\n" + "\t-b memory copy after algorithm -- 1 do(default); 0 not do\n" + "\t-m method of algorithm:"); + for (i = 0; algs[i].name != NULL; i++) + printf(" %s", algs[i].name); + printf("\n"); + +} + +void notice(char *appname, alg_method * alg_choose_p) +{ + int i = 0; + printf("%s starts to run\n", appname); + printf("\tverbose output is %d\n" + "\truntime is %lld(secs)\n" + "\tnumber of algorithm threads is %d\n" + "\tlen of each buffer(KB) is %d\n" + "\tmemory copy before algorithm is %d\n" + "\tmemory copy after algorithm is %d\n" + "\tmethod of algorithm is %s\n", verbose, run_secs, num_threads, buflen / 1024, + prememcpy, postmemcpy, alg_choose_p->name); +} + +int main(int argc, char **argv) +{ + int i = 0; + int opt; + char *optstring = "t:n:m:l:a:b:v"; + int32_t *id = NULL, ret = 0; + alg_method alg_choose; + pthread_t *clients = NULL; + uint64_t count = 0, sum = 0; + uint32_t rounds_buf; + + while ((opt = getopt(argc, argv, optstring)) != -1) { + switch (opt) { + case 't': + run_secs = atol(optarg); + if (run_secs <= 0) { + usage(argv[0]); + exit(-1); + } + break; + case 'n': + num_threads = atoi(optarg); + if (num_threads <= 0) { + usage(argv[0]); + exit(-1); + } + break; + case 'm': + method = optarg; + break; + case 'l': + buflen = atoi(optarg) * 1024; + if (buflen <= 0) { + usage(argv[0]); + exit(-1); + } + break; + case 'a': + prememcpy = atoi(optarg); + if (prememcpy != 0 && prememcpy != 1) { + usage(argv[0]); + exit(-1); + } + break; + case 'b': + postmemcpy = atoi(optarg); + if (postmemcpy != 0 && postmemcpy != 1) { + usage(argv[0]); + exit(-1); + } + break; + case 'v': + verbose = 1; 
+ break; + default: + usage(argv[0]); + exit(0); + } + } + + /* Check method str and set algorithm_func */ + for (i = 0; algs[i].name != NULL; i++) { + if (!strcmp(method, algs[i].name)) { + alg_choose = algs[i]; + break; + } + } + if (algs[i].name == NULL) { + usage(argv[0]); + exit(-1); + } + + notice(argv[0], &alg_choose); + rounds_buf = alg_choose.rounds_nbuf; + + clients = (pthread_t *) calloc(num_threads + 1, sizeof(pthread_t)); + id = (int32_t *) calloc(num_threads + 1, sizeof(int32_t)); + + printf("Start %i threads, use %s function\n", num_threads, alg_choose.name); + + for (i = 0; i < num_threads; i++) { + id[i] = i; + + ret = + pthread_create(&clients[i], NULL, alg_choose.thread_func, (void *)&id[i]); + + if (ret != 0) { + printf("Failed to create thread %i: %s", i, strerror(ret)); + exit(-1); + } + printfv("Thread %i is created\n", i); + } + + for (i = 0; i < num_threads; i++) { + pthread_join(clients[i], (void *)&count); + sum += count; + } + double loop_unit = ((double)buflen) * rounds_buf / run_secs / 1024 / 1024; + printf("Sum of rounds is %ld\n" + "Average throughput(MB/s) is %.2f\n" + "Total throughput(MB/s) is %.2f\n", + sum, (double)sum / i * loop_unit, (double)sum * loop_unit); + + exit(0); +} diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h new file mode 100644 index 000000000..4f38705dd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h @@ -0,0 +1,52 @@ + +#ifndef ISAL_MULTITHREAD_PERF_H_ +#define ISAL_MULTITHREAD_PERF_H_ + +#include "isa-l_crypto.h" + +/* multibuffer hash */ +void *md5_ossl_func(void *arg); +void *md5_mb_func(void *arg); +void *sha1_ossl_func(void *arg); +void *sha1_mb_func(void *arg); +void *sha256_ossl_func(void *arg); +void *sha256_mb_func(void *arg); +void *sha512_ossl_func(void *arg); +void *sha512_mb_func(void *arg); + +/* aes */ +void *cbc_128_dec_func(void *arg); +void *cbc_192_dec_func(void *arg); +void *cbc_256_dec_func(void *arg); +void *xts_128_enc_func(void *arg); +void *xts_256_enc_func(void *arg); +#define AAD_LENGTH 16 +void *gcm_128_enc_func(void *arg); +void *gcm_256_enc_func(void *arg); + + +typedef struct { + char *name; + void *(*thread_func) (void *arg); + uint32_t rounds_nbuf; /* bufs number of one processing round */ +} alg_method; + + +/* Global parameters*/ +extern long long run_secs; +extern uint32_t num_threads; +extern uint32_t buflen; +extern uint32_t prememcpy; +extern uint32_t postmemcpy; + +extern pthread_mutex_t count_lock; +extern pthread_cond_t count_cond; +extern volatile uint32_t count; + +extern int verbose; +#define printfv(format, args...) 
{ \ + if (verbose) \ + printf (format, ##args); \ +} + +#endif /* ISAL_MULTITHREAD_PERF_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c new file mode 100644 index 000000000..f63b3785b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c @@ -0,0 +1,213 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "isal_multithread_perf.h" + +#ifndef HASH_THREAD +/* MD5 related params and structures*/ +#define DIGEST_NWORDS MD5_DIGEST_NWORDS +#define MB_BUFS MD5_MAX_LANES +#define HASH_CTX_MGR MD5_HASH_CTX_MGR +#define HASH_CTX MD5_HASH_CTX + +#define OSSL_THREAD_FUNC md5_ossl_func +#define OSSL_HASH_FUNC MD5 +#define MB_THREAD_FUNC md5_mb_func +#define CTX_MGR_INIT md5_ctx_mgr_init +#define CTX_MGR_SUBMIT md5_ctx_mgr_submit +#define CTX_MGR_FLUSH md5_ctx_mgr_flush + +#define rounds_buf MD5_MAX_LANES + +#endif // HASH_THREAD + +typedef uint32_t hash_digests[DIGEST_NWORDS]; + +void *OSSL_THREAD_FUNC(void *arg) +{ + int32_t id = *((int *)arg); + uint32_t i = 0, j = 0; + char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */ + char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */ + hash_digests digest; + uint64_t round = -1; + struct timeval start_tv, stop_tv; + long long secs = run_secs; + + printfv("Thread %i is started\n", id); + /* memory allocate */ + for (j = 0; j < rounds_buf; j++) { + carry_buf[j] = (char *)calloc((size_t)buflen, 1); + if (carry_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + hash_buf[j] = (char *)calloc((size_t)buflen, 1); + if (hash_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + /* Create the random data */ + for (i = 0; i < buflen; i += 1024) { + carry_buf[j][i] = i % 256; + hash_buf[j][i] = i % 256; + } + } + + /* Thread sync */ + pthread_mutex_lock(&count_lock); + count++; + if (count == num_threads) { + pthread_cond_broadcast(&count_cond); + } else { + pthread_cond_wait(&count_cond, &count_lock); + } + pthread_mutex_unlock(&count_lock); + + printfv("Thread %i is ready\n", id); + /* hash func starts to run */ + round = 0; + gettimeofday(&start_tv, 0); + gettimeofday(&stop_tv, 0); + while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) { + for (j = 0; j < rounds_buf; j++) { + /* Pre mem-operation */ + if (prememcpy) + memcpy(hash_buf[j], carry_buf[j], buflen); + + /* Calculate hash digest */ + OSSL_HASH_FUNC((char *)hash_buf[j], buflen, (unsigned char *)&digest); + + /* Post mem-operation */ + if (postmemcpy) + memcpy(carry_buf[j], hash_buf[j], buflen); + } + round++; + + gettimeofday(&stop_tv, 0); + } + printfv("thread %2i, openssl_func rounds %ld\n", id, round); + + out: + for (j = 0; j < rounds_buf; j++) { + free(carry_buf[j]); + free(hash_buf[j]); + } + + pthread_exit((void *)round); +} + +void *MB_THREAD_FUNC(void *arg) +{ + int32_t id = *((int *)arg); + uint32_t i = 0, j = 0; + char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */ + char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */ + hash_digests *digests[rounds_buf]; + uint64_t round = -1; + struct timeval start_tv, stop_tv; + long long secs = run_secs; + int ret; + + HASH_CTX_MGR *mgr = NULL; + HASH_CTX *ctxpool = NULL, *ctx = NULL; + + printfv("Thread %i is started\n", id); + /* Memory allocate */ + for (j = 0; j < rounds_buf; j++) { 
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1); + if (carry_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + hash_buf[j] = (char *)calloc((size_t)buflen, 1); + if (hash_buf[j] == NULL) { + printf("calloc failed test aborted\n"); + goto out; + } + + digests[j] = (hash_digests *) calloc(sizeof(hash_digests), 1); + + /* Create the random data */ + for (i = 0; i < buflen; i += 1024) { + carry_buf[j][i] = i % 256; + hash_buf[j][i] = i % 256; + } + } + + ctxpool = (HASH_CTX *) calloc(rounds_buf, sizeof(HASH_CTX)); + for (i = 0; i < rounds_buf; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + ret = posix_memalign((void *)&mgr, 16, sizeof(HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + goto out; + } + CTX_MGR_INIT(mgr); + + printfv("Thread %i gets to wait\n", id); + /* Thread sync */ + pthread_mutex_lock(&count_lock); + count++; + if (count == num_threads) { + pthread_cond_broadcast(&count_cond); + } else { + pthread_cond_wait(&count_cond, &count_lock); + } + pthread_mutex_unlock(&count_lock); + + printfv("Thread %i is ready\n", id); + /* hash func starts to run */ + round = 0; + gettimeofday(&start_tv, 0); + gettimeofday(&stop_tv, 0); + while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) { + for (j = 0; j < rounds_buf; j += MB_BUFS) { + for (i = 0; i < MB_BUFS; i++) { + /* Pre mem-operation */ + if (prememcpy) + memcpy(hash_buf[j + i], carry_buf[j + i], buflen); + + CTX_MGR_SUBMIT(mgr, &ctxpool[j + i], hash_buf[j + i], buflen, + HASH_ENTIRE); + } + + /* Calculate hash digest */ + while (CTX_MGR_FLUSH(mgr)) ; + for (i = 0; i < MB_BUFS; i++) { + /* Post mem-operation */ + if (postmemcpy) + memcpy(carry_buf[j + i], hash_buf[j + i], buflen); + } + } + round++; + + gettimeofday(&stop_tv, 0); + } + printfv("thread %2i, multibuffer_func rounds %ld\n", id, round); + + out: + free(ctxpool); + free(mgr); + for (j = 0; j < rounds_buf; j++) { + free(carry_buf[j]); + free(digests[j]); + free(hash_buf[j]); + } + + pthread_exit((void *)round); +} diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c new file mode 100644 index 000000000..5ec7eb04a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c @@ -0,0 +1,20 @@ + +#define HASH_THREAD +/* sha1 related params and structures*/ +#define DIGEST_NWORDS SHA1_DIGEST_NWORDS +#define MB_BUFS SHA1_MAX_LANES +#define HASH_CTX_MGR SHA1_HASH_CTX_MGR +#define HASH_CTX SHA1_HASH_CTX + +#define OSSL_THREAD_FUNC sha1_ossl_func +#define OSSL_HASH_FUNC SHA1 +#define MB_THREAD_FUNC sha1_mb_func +#define CTX_MGR_INIT sha1_ctx_mgr_init +#define CTX_MGR_SUBMIT sha1_ctx_mgr_submit +#define CTX_MGR_FLUSH sha1_ctx_mgr_flush + +#define rounds_buf SHA1_MAX_LANES + +#include "md5_thread.c" + +#undef HASH_THREAD diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c new file mode 100644 index 000000000..c155c19d4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c @@ -0,0 +1,20 @@ + +#define HASH_THREAD +/* sha256 related params and structures*/ +#define DIGEST_NWORDS SHA256_DIGEST_NWORDS +#define MB_BUFS SHA256_MAX_LANES +#define HASH_CTX_MGR SHA256_HASH_CTX_MGR +#define HASH_CTX SHA256_HASH_CTX + +#define OSSL_THREAD_FUNC sha256_ossl_func +#define OSSL_HASH_FUNC SHA256 +#define 
MB_THREAD_FUNC sha256_mb_func +#define CTX_MGR_INIT sha256_ctx_mgr_init +#define CTX_MGR_SUBMIT sha256_ctx_mgr_submit +#define CTX_MGR_FLUSH sha256_ctx_mgr_flush + +#define rounds_buf SHA256_MAX_LANES + +#include "md5_thread.c" + +#undef HASH_THREAD diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c new file mode 100644 index 000000000..5861835a8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c @@ -0,0 +1,20 @@ + +#define HASH_THREAD +/* sha512 related params and structures*/ +#define DIGEST_NWORDS (SHA512_DIGEST_NWORDS * 2) +#define MB_BUFS SHA512_MAX_LANES +#define HASH_CTX_MGR SHA512_HASH_CTX_MGR +#define HASH_CTX SHA512_HASH_CTX + +#define OSSL_THREAD_FUNC sha512_ossl_func +#define OSSL_HASH_FUNC SHA512 +#define MB_THREAD_FUNC sha512_mb_func +#define CTX_MGR_INIT sha512_ctx_mgr_init +#define CTX_MGR_SUBMIT sha512_ctx_mgr_submit +#define CTX_MGR_FLUSH sha512_ctx_mgr_flush + +#define rounds_buf SHA512_MAX_LANES + +#include "md5_thread.c" + +#undef HASH_THREAD diff --git a/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h new file mode 100644 index 000000000..a8f81b232 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h @@ -0,0 +1,301 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#ifndef __AARCH64_MULTIBINARY_H__ +#define __AARCH64_MULTIBINARY_H__ +#ifndef __aarch64__ +#error "This file is for aarch64 only" +#endif +#include +#ifdef __ASSEMBLY__ +/** + * # mbin_interface : the wrapper layer for isal-l api + * + * ## references: + * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S + * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf + * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310 + * + * ## Usage: + * 1. Define dispather function + * 2. name must be \name\()_dispatcher + * 3. Prototype should be *"void * \name\()_dispatcher"* + * 4. The dispather should return the right function pointer , revision and a string information . + **/ +.macro mbin_interface name:req + .extern \name\()_dispatcher + .section .data + .balign 8 + .global \name\()_dispatcher_info + .type \name\()_dispatcher_info,%object + + \name\()_dispatcher_info: + .quad \name\()_mbinit //func_entry + + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info + + .balign 8 + .text + \name\()_mbinit: + //save lp fp, sub sp + .cfi_startproc + stp x29, x30, [sp, -224]! + + //add cfi directive to avoid GDB bt cmds error + //set cfi(Call Frame Information) + .cfi_def_cfa_offset 224 + .cfi_offset 29, -224 + .cfi_offset 30, -216 + + //save parameter/result/indirect result registers + stp x8, x9, [sp, 16] + .cfi_offset 8, -208 + .cfi_offset 9, -200 + stp x0, x1, [sp, 32] + .cfi_offset 0, -192 + .cfi_offset 1, -184 + stp x2, x3, [sp, 48] + .cfi_offset 2, -176 + .cfi_offset 3, -168 + stp x4, x5, [sp, 64] + .cfi_offset 4, -160 + .cfi_offset 5, -152 + stp x6, x7, [sp, 80] + .cfi_offset 6, -144 + .cfi_offset 7, -136 + stp q0, q1, [sp, 96] + .cfi_offset 64, -128 + .cfi_offset 65, -112 + stp q2, q3, [sp, 128] + .cfi_offset 66, -96 + .cfi_offset 67, -80 + stp q4, q5, [sp, 160] + .cfi_offset 68, -64 + .cfi_offset 69, -48 + stp q6, q7, [sp, 192] + .cfi_offset 70, -32 + .cfi_offset 71, -16 + + /** + * The dispatcher functions have the following prototype: + * void * function_dispatcher(void) + * As the dispatcher is returning a struct, by the AAPCS, + */ + + + bl \name\()_dispatcher + //restore temp/indirect result registers + ldp x8, x9, [sp, 16] + .cfi_restore 8 + .cfi_restore 9 + + // save function entry + str x0, [x9] + + //restore parameter/result registers + ldp x0, x1, [sp, 32] + .cfi_restore 0 + .cfi_restore 1 + ldp x2, x3, [sp, 48] + .cfi_restore 2 + .cfi_restore 3 + ldp x4, x5, [sp, 64] + .cfi_restore 4 + .cfi_restore 5 + ldp x6, x7, [sp, 80] + .cfi_restore 6 + .cfi_restore 7 + ldp q0, q1, [sp, 96] + .cfi_restore 64 + .cfi_restore 65 + ldp q2, q3, [sp, 128] + .cfi_restore 66 + .cfi_restore 67 + ldp q4, q5, [sp, 160] + .cfi_restore 68 + .cfi_restore 69 + ldp q6, q7, [sp, 192] + .cfi_restore 70 + .cfi_restore 71 + //save lp fp and sp + ldp x29, x30, [sp], 224 + //restore cfi setting + .cfi_restore 30 + .cfi_restore 29 + .cfi_def_cfa_offset 0 + .cfi_endproc + + .global \name + .type \name,%function + .align 2 + \name\(): + adrp x9, :got:\name\()_dispatcher_info + ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info] + ldr x10,[x9] + br x10 + .size \name,. 
- \name + +.endm + +/** + * mbin_interface_base is used for the interfaces which have only + * noarch implementation + */ +.macro mbin_interface_base name:req, base:req + .extern \base + .section .data + .balign 8 + .global \name\()_dispatcher_info + .type \name\()_dispatcher_info,%object + + \name\()_dispatcher_info: + .quad \base //func_entry + .size \name\()_dispatcher_info,. - \name\()_dispatcher_info + + .balign 8 + .text + .global \name + .type \name,%function + .align 2 + \name\(): + adrp x9, :got:\name\()_dispatcher_info + ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info] + ldr x10,[x9] + br x10 + .size \name,. - \name + +.endm + +#else /* __ASSEMBLY__ */ +#include + + + +#define DEFINE_INTERFACE_DISPATCHER(name) \ + void * name##_dispatcher(void) + +#define PROVIDER_BASIC(name) \ + PROVIDER_INFO(name##_base) + +#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x +#define DO_PRAGMA(x) _Pragma (#x) +#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x) +#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push) +#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop) + + +#define PROVIDER_INFO(_func_entry) \ + ({ DIGNOSTIC_PUSH() \ + DIGNOSTIC_IGNORE(-Wnested-externs) \ + extern void _func_entry(void); \ + DIGNOSTIC_POP() \ + _func_entry; \ + }) + +/** + * Micro-Architector definitions + * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1 + */ + +#define CPU_IMPLEMENTER_RESERVE 0x00 +#define CPU_IMPLEMENTER_ARM 0x41 + + +#define CPU_PART_CORTEX_A57 0xD07 +#define CPU_PART_CORTEX_A72 0xD08 +#define CPU_PART_NEOVERSE_N1 0xD0C + +#define MICRO_ARCH_ID(imp,part) \ + (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4)) + +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1<<11) +#endif + +/** + * @brief get_micro_arch_id + * read micro-architector register instruction if possible.This function + * provides microarchitecture information and make microarchitecture optimization + * possible. It will trap into kernel due to mrs instruction. So it should + * be called only in dispatcher, that will be called only once in program + * lifecycle. And HWCAP must be match,That will make sure there are no + * illegal instruction errors. + * + * NOTICE: + * - HWCAP_CPUID should be available. Otherwise it returns zero + * - It MUST be called inside dispather. + * - It MUST meet the HWCAP requirements + * + * Example: + * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi) + * { + * unsigned long auxval = getauxval(AT_HWCAP); + * // MUST do the judgement is MUST. + * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) { + * switch (get_micro_arch_id()) { + * case MICRO_ARCH_ID(ARM, CORTEX_A57): + * return PROVIDER_INFO(crc32_pmull_crc_for_a57); + * case MICRO_ARCH_ID(ARM, CORTEX_A72): + * return PROVIDER_INFO(crc32_pmull_crc_for_a72); + * case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + * return PROVIDER_INFO(crc32_pmull_crc_for_n1); + * case default: + * return PROVIDER_INFO(crc32_pmull_crc_for_others); + * } + * } + * return PROVIDER_BASIC(crc32_iscsi); + * } + * KNOWN ISSUE: + * On a heterogeneous system (big.LITTLE), it will work but the performance + * might not be the best one as expected. + * + * If this function is called on the big core, it will return the function + * optimized for the big core. + * + * If execution is then scheduled to the little core. It will still work (1), + * but the function won't be optimized for the little core, thus the performance + * won't be as expected. 
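+ * + * Note: the value returned below is MIDR_EL1 masked with 0xff00fff0, i.e. only + * the implementer field (bits 31:24) and the part number field (bits 15:4) are + * kept, which matches the encoding produced by MICRO_ARCH_ID(imp, part) above.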
+ * + * References: + * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5) + * + */ +static inline uint32_t get_micro_arch_id(void) +{ + uint32_t id=CPU_IMPLEMENTER_RESERVE; + if ((getauxval(AT_HWCAP) & HWCAP_CPUID)) { + + asm("mrs %0, MIDR_EL1 " : "=r" (id)); + } + return id&0xff00fff0; +} + + + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h new file mode 100644 index 000000000..aaf87ada1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h @@ -0,0 +1,165 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/** + * @file aes_cbc.h + * @brief AES CBC encryption/decryption function prototypes. 
+ * + */ +#ifndef _AES_CBC_h +#define _AES_CBC_h + +#include + +#ifdef __cplusplus +extern "C" { + +#endif + +typedef enum cbc_key_size { CBC_128_BITS = 16, CBC_192_BITS = 24, CBC_256_BITS = 32} cbc_key_size; +#define CBC_ROUND_KEY_LEN (16) +#define CBC_128_KEY_ROUNDS (10+1) /*expanded key holds 10 key rounds plus original key*/ +#define CBC_192_KEY_ROUNDS (12+1) /*expanded key holds 12 key rounds plus original key*/ +#define CBC_256_KEY_ROUNDS (14+1) /*expanded key holds 14 key rounds plus original key*/ +#define CBC_MAX_KEYS_SIZE (CBC_ROUND_KEY_LEN * CBC_256_KEY_ROUNDS) + +#define CBC_IV_DATA_LEN (16) + +/** @brief holds intermediate key data used in encryption/decryption + * + */ +struct cbc_key_data { // must be 16 byte aligned + uint8_t enc_keys[CBC_MAX_KEYS_SIZE]; + uint8_t dec_keys[CBC_MAX_KEYS_SIZE]; +}; + +/** @brief CBC-AES key pre-computation done once for a key + * + * @requires SSE4.1 and AESNI + * + * arg 1: in: pointer to key + * arg 2: OUT: pointer to a key expanded data + */ +int aes_cbc_precomp( + uint8_t *key, + int key_size, + struct cbc_key_data *keys_blk +); + +/** @brief CBC-AES 128 bit key Decryption + * + * @requires SSE4.1 and AESNI + * + * arg 1: in: pointer to input (cipher text) + * arg 2: IV: pointer to IV, Must be 16 bytes aligned to a 16 byte boundary + * arg 3: keys: pointer to keys, Must be on a 16 byte boundary and length of key size * key rounds + * arg 4: OUT: pointer to output (plain text ... in-place allowed) + * arg 5: len_bytes: length in bytes (multiple of 16) + */ +void aes_cbc_dec_128( + void *in, //!< Input cipher text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data + void *out, //!< Output plain text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); + +/** @brief CBC-AES 192 bit key Decryption + * +* @requires SSE4.1 and AESNI +* +*/ +void aes_cbc_dec_192( + void *in, //!< Input cipher text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data + void *out, //!< Output plain text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); + +/** @brief CBC-AES 256 bit key Decryption + * +* @requires SSE4.1 and AESNI +* +*/ +void aes_cbc_dec_256( + void *in, //!< Input cipher text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data + void *out, //!< Output plain text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); + +/** @brief CBC-AES 128 bit key Encryption + * + * @requires SSE4.1 and AESNI + * + * arg 1: in: pointer to input (plain text) + * arg 2: IV: pointer to IV, Must be 16 bytes aligned to a 16 byte boundary + * arg 3: keys: pointer to keys, Must be on a 16 byte boundary and length of key size * key rounds + * arg 4: OUT: pointer to output (cipher text ... 
in-place allowed) + * arg 5: len_bytes: length in bytes (multiple of 16) + */ +int aes_cbc_enc_128( + void *in, //!< Input plain text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data + void *out, //!< Output cipher text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); +/** @brief CBC-AES 192 bit key Encryption + * +* @requires SSE4.1 and AESNI +* +*/ +int aes_cbc_enc_192( + void *in, //!< Input plain text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data + void *out, //!< Output cipher text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); + +/** @brief CBC-AES 256 bit key Encryption + * +* @requires SSE4.1 and AESNI +* +*/ +int aes_cbc_enc_256( + void *in, //!< Input plain text + uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary + uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data + void *out, //!< Output cipher text + uint64_t len_bytes //!< Must be a multiple of 16 bytes + ); + +#ifdef __cplusplus +} +#endif //__cplusplus +#endif //ifndef _AES_CBC_h diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h new file mode 100644 index 000000000..b407b7f6b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h @@ -0,0 +1,613 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/** + * @file aes_gcm.h + * @brief AES GCM encryption/decryption function prototypes. + * + * At build time there is an option to use non-temporal loads and stores + * selected by defining the compile time option NT_LDST. 
The use of this option + * places the following restriction on the gcm encryption functions: + * + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * + * - When using the streaming API, all partial input buffers must be a multiple + * of 64 bytes long except for the last input buffer. + * + * - In-place encryption/decryption is not recommended. + * + */ + +/* +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. +; +; +; +; Assumptions: Support for SSE4.1 or greater, AVX or AVX2 +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; + */ +#ifndef _AES_GCM_h +#define _AES_GCM_h + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */ +#define MAX_TAG_LEN (16) +// +// IV data is limited to 16 bytes. The last DWORD (4 bytes) must be 0x1 +// +#define GCM_IV_LEN (16) +#define GCM_IV_DATA_LEN (12) +#define GCM_IV_END_MARK {0x00, 0x00, 0x00, 0x01}; +#define GCM_IV_END_START (12) + +#define LONGEST_TESTED_AAD_LENGTH (2* 1024) + +// Key lengths of 128 and 256 supported +#define GCM_128_KEY_LEN (16) +#define GCM_256_KEY_LEN (32) + +#define GCM_BLOCK_LEN 16 +#define GCM_ENC_KEY_LEN 16 +#define GCM_KEY_SETS (15) /*exp key + 14 exp round keys*/ + +/** + * @brief holds intermediate key data needed to improve performance + * + * gcm_data hold internal key information used by gcm128 and gcm256. 
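In other words, the 16-byte IV block that these macros describe is simply the 12-byte IV data followed by the 0x00000001 end mark in the last DWORD. A small sketch of laying it out (gcm_build_iv and its buffer names are illustrative, not part of the API):

#include <stdint.h>
#include <string.h>

/* Sketch: build the 16-byte GCM IV block, i.e. 12 bytes of IV data followed by
 * the 0x00000001 end mark (see GCM_IV_DATA_LEN / GCM_IV_END_MARK above). */
static void gcm_build_iv(uint8_t iv_block[16], const uint8_t iv_data[12])
{
	static const uint8_t end_mark[4] = { 0x00, 0x00, 0x00, 0x01 };

	memcpy(iv_block, iv_data, 12);       /* GCM_IV_DATA_LEN bytes of IV data */
	memcpy(iv_block + 12, end_mark, 4);  /* end mark starts at GCM_IV_END_START */
}

Note that the aes_gcm_* functions declared below take only the 12-byte IV and append this end mark internally, so a caller normally never builds the full block by hand.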
+ */ +struct gcm_data { + uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; + uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here + uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here + uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here + uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here + uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here + uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here + uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here + uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here + uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) + uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) + // init, update and finalize context data + uint8_t aad_hash[GCM_BLOCK_LEN]; + uint64_t aad_length; + uint64_t in_length; + uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; + uint8_t orig_IV[GCM_BLOCK_LEN]; + uint8_t current_counter[GCM_BLOCK_LEN]; + uint64_t partial_block_length; +}; + +/** + * @brief holds intermediate key data needed to improve performance + * + * gcm_key_data hold internal key information used by gcm128, gcm192 and gcm256. 
+ */ +#ifdef __WIN32 +__declspec(align(16)) +#endif /* WIN32 */ +struct gcm_key_data { + uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; + uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here + uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here + uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here + uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here + uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here + uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here + uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here + uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here + uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits + uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // and Low 64b of HashKey^n <<1 mod poly + uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // here (for Karatsuba purposes) + uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; + uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; + uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; + uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; + uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; +#ifdef GCM_BIG_DATA + uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (128 - 16)]; // Big data version needs 128 +#else + uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (48 - 16)]; // Others vaes version needs 48 +#endif +} +#if defined (__unix__) || (__APPLE__) || (__MINGW32__) + __attribute__ ((aligned (16))); +#else + ; +#endif + +/** + * @brief holds GCM operation context + */ +struct gcm_context_data { + // init, update and finalize context data + uint8_t aad_hash[GCM_BLOCK_LEN]; + uint64_t aad_length; + uint64_t in_length; + uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; + uint8_t orig_IV[GCM_BLOCK_LEN]; + uint8_t current_counter[GCM_BLOCK_LEN]; + uint64_t partial_block_length; +}; + +/* ------------------ New interface for separate expanded keys ------------ */ + +/** + * @brief GCM-AES Encryption using 128 bit keys + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_128( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed + uint8_t const *in, //!< Plaintext input + uint64_t len, //!< Length of data in Bytes for encryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief GCM-AES Encryption using 256 bit keys + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_256( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed + uint8_t const *in, //!< Plaintext input + uint64_t len, //!< Length of data in Bytes for encryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. 
+ uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + + +/** + * @brief GCM-AES Decryption using 128 bit keys + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_128( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed + uint8_t const *in, //!< Ciphertext input + uint64_t len, //!< Length of data in Bytes for decryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief GCM-AES Decryption using 128 bit keys + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_256( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed + uint8_t const *in, //!< Ciphertext input + uint64_t len, //!< Length of data in Bytes for decryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + + +/** + * @brief Start a AES-GCM Encryption message 128 bit key + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_init_128( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *iv, //!< Pointer to 12 byte IV structure + //!< Internally, library concates 0x00000001 value to it + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len //!< Length of AAD + ); + +/** + * @brief Start a AES-GCM Encryption message 256 bit key + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_init_256( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *iv, //!< Pointer to 12 byte IV structure + //!< Internally, library concates 0x00000001 value to it + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len //!< Length of AAD + ); + +/** + * @brief Encrypt a block of a AES-128-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_128_update( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed. 
+ const uint8_t *in, //!< Plaintext input + uint64_t len //!< Length of data in Bytes for encryption + ); + +/** + * @brief Encrypt a block of a AES-256-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_256_update( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed. + const uint8_t *in, //!< Plaintext input + uint64_t len //!< Length of data in Bytes for encryption + ); + +/** + * @brief Decrypt a block of a AES-128-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_128_update( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed. + const uint8_t *in, //!< Ciphertext input + uint64_t len //!< Length of data in Bytes for decryption + ); + +/** + * @brief Decrypt a block of a AES-256-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_256_update( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed. + const uint8_t *in, //!< Ciphertext input + uint64_t len //!< Length of data in Bytes for decryption + ); + +/** + * @brief End encryption of a AES-128-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_128_finalize( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief End encryption of a AES-256-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_256_finalize( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief End decryption of a AES-128-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_128_finalize( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief End decryption of a AES-256-GCM Encryption message + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_256_finalize( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). 
+ //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief Pre-processes GCM key data 128 bit + * + * Prefills the gcm key data with key values for each round and + * the initial sub hash key for tag encoding + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_pre_128( + const void *key, //!< Pointer to key data + struct gcm_key_data *key_data //!< GCM expanded key data + ); + +/** + * @brief Pre-processes GCM key data 128 bit + * + * Prefills the gcm key data with key values for each round and + * the initial sub hash key for tag encoding + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_pre_256( + const void *key, //!< Pointer to key data + struct gcm_key_data *key_data //!< GCM expanded key data + ); + + + +/* ---- NT versions ---- */ +/** + * @brief GCM-AES Encryption using 128 bit keys, Non-temporal data + * + * Non-temporal version of encrypt has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_128_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed + uint8_t const *in, //!< Plaintext input + uint64_t len, //!< Length of data in Bytes for encryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief GCM-AES Encryption using 256 bit keys, Non-temporal data + * + * Non-temporal version of encrypt has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_256_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed + uint8_t const *in, //!< Plaintext input + uint64_t len, //!< Length of data in Bytes for encryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + + +/** + * @brief GCM-AES Decryption using 128 bit keys, Non-temporal data + * + * Non-temporal version of decrypt has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_128_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. 
Decrypt in-place is allowed + uint8_t const *in, //!< Ciphertext input + uint64_t len, //!< Length of data in Bytes for decryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + +/** + * @brief GCM-AES Decryption using 128 bit keys, Non-temporal data + * + * Non-temporal version of decrypt has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_256_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed + uint8_t const *in, //!< Ciphertext input + uint64_t len, //!< Length of data in Bytes for decryption + uint8_t *iv, //!< iv pointer to 12 byte IV structure. + //!< Internally, library concates 0x00000001 value to it. + uint8_t const *aad, //!< Additional Authentication Data (AAD) + uint64_t aad_len, //!< Length of AAD + uint8_t *auth_tag, //!< Authenticated Tag output + uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). + //!< Valid values are 16 (most likely), 12 or 8 + ); + + +/** + * @brief Encrypt a block of a AES-128-GCM Encryption message, Non-temporal data + * + * Non-temporal version of encrypt update has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - All partial input buffers must be a multiple of 64 bytes long except for + * the last input buffer. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_128_update_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed. + const uint8_t *in, //!< Plaintext input + uint64_t len //!< Length of data in Bytes for encryption + ); + +/** + * @brief Encrypt a block of a AES-256-GCM Encryption message, Non-temporal data + * + * Non-temporal version of encrypt update has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - All partial input buffers must be a multiple of 64 bytes long except for + * the last input buffer. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_enc_256_update_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed. + const uint8_t *in, //!< Plaintext input + uint64_t len //!< Length of data in Bytes for encryption + ); + +/** + * @brief Decrypt a block of a AES-128-GCM Encryption message, Non-temporal data + * + * Non-temporal version of decrypt update has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. 
+ * - All partial input buffers must be a multiple of 64 bytes long except for + * the last input buffer. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_128_update_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed. + const uint8_t *in, //!< Ciphertext input + uint64_t len //!< Length of data in Bytes for decryption + ); + +/** + * @brief Decrypt a block of a AES-256-GCM Encryption message, Non-temporal data + * + * Non-temporal version of decrypt update has additional restrictions: + * - The plaintext and cyphertext buffers must be aligned on a 64 byte boundary. + * - All partial input buffers must be a multiple of 64 bytes long except for + * the last input buffer. + * - In-place encryption/decryption is not recommended. Performance can be slow. + * + * @requires SSE4.1 and AESNI + */ +void aes_gcm_dec_256_update_nt( + const struct gcm_key_data *key_data, //!< GCM expanded key data + struct gcm_context_data *context_data, //!< GCM operation context data + uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed. + const uint8_t *in, //!< Ciphertext input + uint64_t len //!< Length of data in Bytes for decryption + ); + + +#ifdef __cplusplus +} +#endif //__cplusplus +#endif //ifndef _AES_GCM_h diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h new file mode 100644 index 000000000..6ecded301 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h @@ -0,0 +1,76 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
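Tying the aes_gcm.h interface above together, here is a minimal sketch of the streaming encrypt flow: key expansion once, then init/update/finalize. The wrapper name, buffer names and the two-chunk split are illustrative assumptions only:

#include <stdint.h>
#include "aes_gcm.h"

/* Sketch: AES-128-GCM streaming encryption of a message in two chunks,
 * producing a 16-byte tag. Assumes a raw 16-byte key and a 12-byte IV. */
static void gcm128_encrypt_two_chunks(const uint8_t key[GCM_128_KEY_LEN],
				      uint8_t iv[GCM_IV_DATA_LEN],
				      const uint8_t *aad, uint64_t aad_len,
				      const uint8_t *pt, uint64_t pt_len,
				      uint8_t *ct, uint8_t tag[MAX_TAG_LEN])
{
	struct gcm_key_data key_data;     /* 16-byte aligned via its type attribute */
	struct gcm_context_data ctx;
	uint64_t half = pt_len / 2;

	aes_gcm_pre_128(key, &key_data);                      /* expand key + hash subkeys */
	aes_gcm_init_128(&key_data, &ctx, iv, aad, aad_len);  /* start the message */
	aes_gcm_enc_128_update(&key_data, &ctx, ct, pt, half);
	aes_gcm_enc_128_update(&key_data, &ctx, ct + half, pt + half, pt_len - half);
	aes_gcm_enc_128_finalize(&key_data, &ctx, tag, MAX_TAG_LEN);
}

Decryption typically follows the same pattern with the dec_128 variants, after which the caller compares the computed tag against the received one.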
+**********************************************************************/ + +#ifndef _KEYEXP_128_H +#define _KEYEXP_128_H + +/** + * @file aes_keyexp.h + * @brief AES key expansion functions + * + * This defines the interface to key expansion functions. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @brief AES key expansion 128 bit +* @requires SSE4.1 +*/ +void aes_keyexp_128( + const uint8_t *key, //!< input key for AES-128, 16 bytes + uint8_t *exp_key_enc, //!< expanded encryption keys, 16*11 bytes + uint8_t *exp_key_dec //!< expanded decryption keys, 16*11 bytes + ); + +/** @brief AES key expansion 192 bit +* @requires SSE4.1 +*/ +void aes_keyexp_192( + const uint8_t *key, //!< input key for AES-192, 16*1.5 bytes + uint8_t *exp_key_enc, //!< expanded encryption keys, 16*13 bytes + uint8_t *exp_key_dec //!< expanded decryption keys, 16*13 bytes + ); + +/** @brief AES key expansion 256 bit +* @requires SSE4.1 +*/ +void aes_keyexp_256( + const uint8_t *key, //!< input key for AES-256, 16*2 bytes + uint8_t *exp_key_enc, //!< expanded encryption keys, 16*15 bytes + uint8_t *exp_key_dec //!< expanded decryption keys, 16*15 bytes + ); + +#ifdef __cplusplus +} +#endif //__cplusplus +#endif //ifndef _KEYEXP_128_H diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h new file mode 100644 index 000000000..2021284f5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h @@ -0,0 +1,214 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#ifndef _AES_XTS_H +#define _AES_XTS_H + +/** + * @file aes_xts.h + * @brief AES XTS encryption function prototypes. + * + * This defines the interface to optimized AES XTS functions + * + * Pre-expanded keys + * + * For key encryption, pre-expanded keys are stored in the order that they will be + * used. 
As an example, if Key[0] is the 128-bit initial key used for an AES-128
+ * encryption, the rest of the keys are stored as follows:
+ *
+ *   • Key[0]  : Initial encryption key
+ *   • Key[1]  : Round 1 encryption key
+ *   • Key[2]  : Round 2 encryption key
+ *   • ...
+ *   • Key[10] : Round 10 encryption key
+ *
+ * For decryption, the order of keys is reversed. However, we apply the
+ * necessary aesimc instructions before storing the expanded keys. For the same key
+ * used above, the pre-expanded keys will be stored as follows:
+ *
+ *   • Key[0]  : Round 10 encryption key
+ *   • Key[1]  : aesimc(Round 9 encryption key)
+ *   • Key[2]  : aesimc(Round 8 encryption key)
+ *   • ...
+ *   • Key[9]  : aesimc(Round 1 encryption key)
+ *   • Key[10] : Initial encryption key
+ *
+ * Note: The expanded key decryption requires a decryption key only for the block
+ * decryption step. The tweak step in the expanded key decryption requires the same
+ * expanded encryption key that is used in the expanded key encryption.
+ *
+ * Input and Output Buffers
+ *
+ * The input and output buffers can overlap as long as the output buffer
+ * pointer is not less than the input buffer pointer. If the two pointers are the
+ * same, then encryption/decryption will occur in-place.
+ *
+ * Data Length
+ *
+ *   • The functions support any data length greater than or equal to 16 bytes.
+ *   • Data length is a 64-bit value, which makes the largest possible data length
+ *     2^64 - 1 bytes.
+ *   • For data lengths from 0 to 15 bytes, the functions return without any error
+ *     codes, without reading or writing any data.
+ *   • The functions only support byte lengths, not bits.
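As a usage sketch of the XTS prototypes declared below (the wrapper name, the 512-byte sector size and the little-endian sector-number tweak are illustrative assumptions, not requirements of this header):

#include <stdint.h>
#include "aes_xts.h"

/* Sketch: encrypt one 512-byte sector with XTS-AES-128.
 * k2 is the tweak key, k1 the data key, both 16 bytes of raw key material. */
static void xts128_encrypt_sector(uint8_t k2[16], uint8_t k1[16],
				  uint64_t sector_num,
				  const uint8_t pt[512], uint8_t ct[512])
{
	uint8_t tweak[16] = { 0 };

	/* Example tweak: little-endian sector number, zero-padded to 128 bits. */
	for (int i = 0; i < 8; i++)
		tweak[i] = (uint8_t)(sector_num >> (8 * i));

	XTS_AES_128_enc(k2, k1, tweak, 512, pt, ct);
}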
+ * + * Initial Tweak + * + * The functions accept a 128-bit initial tweak value. The user is responsible for + * padding the initial tweak value to this length. + * + * Data Alignment + * + * The input and output buffers, keys, pre-expanded keys and initial tweak value + * are not required to be aligned to 16 bytes, any alignment works. + * + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @brief XTS-AES-128 Encryption + * @requires AES-NI + */ + +void XTS_AES_128_enc( + uint8_t *k2, //!< key used for tweaking, 16 bytes + uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *pt, //!< plaintext sector input data + uint8_t *ct //!< ciphertext sector output data + ); + +/** @brief XTS-AES-128 Encryption with pre-expanded keys + * @requires AES-NI + */ + +void XTS_AES_128_enc_expanded_key( + uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes + uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *pt, //!< plaintext sector input data + uint8_t *ct //!< ciphertext sector output data + ); + +/** @brief XTS-AES-128 Decryption + * @requires AES-NI + */ + +void XTS_AES_128_dec( + uint8_t *k2, //!< key used for tweaking, 16 bytes + uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *ct, //!< ciphertext sector input data + uint8_t *pt //!< plaintext sector output data + ); + +/** @brief XTS-AES-128 Decryption with pre-expanded keys + * @requires AES-NI + */ + +void XTS_AES_128_dec_expanded_key( + uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used + uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *ct, //!< ciphertext sector input data + uint8_t *pt //!< plaintext sector output data + ); + +/** @brief XTS-AES-256 Encryption + * @requires AES-NI + */ + +void XTS_AES_256_enc( + uint8_t *k2, //!< key used for tweaking, 16*2 bytes + uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16*2 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *pt, //!< plaintext sector input data + uint8_t *ct //!< ciphertext sector output data + ); + +/** @brief XTS-AES-256 Encryption with pre-expanded keys + * @requires AES-NI + */ + +void XTS_AES_256_enc_expanded_key( + uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes + uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *pt, //!< plaintext sector input data + uint8_t *ct //!< ciphertext sector output data + ); + +/** @brief XTS-AES-256 Decryption + * @requires AES-NI + */ + +void XTS_AES_256_dec( + uint8_t *k2, //!< key used for tweaking, 16*2 bytes + uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16*2 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *ct, //!< ciphertext sector input data + uint8_t *pt //!< 
plaintext sector output data + ); + +/** @brief XTS-AES-256 Decryption with pre-expanded keys + * @requires AES-NI + */ + +void XTS_AES_256_dec_expanded_key( + uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used + uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes + uint8_t *TW_initial, //!< initial tweak value, 16 bytes + uint64_t N, //!< sector size, in bytes + const uint8_t *ct, //!< ciphertext sector input data + uint8_t *pt //!< plaintext sector output data + ); + +#ifdef __cplusplus +} +#endif + +#endif //_AES_XTS_H diff --git a/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm new file mode 100644 index 000000000..3298ce374 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm @@ -0,0 +1,79 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Macros for defining data structures + +; Usage example + +;START_FIELDS ; JOB_AES +;;; name size align +;FIELD _plaintext, 8, 8 ; pointer to plaintext +;FIELD _ciphertext, 8, 8 ; pointer to ciphertext +;FIELD _IV, 16, 8 ; IV +;FIELD _keys, 8, 8 ; pointer to keys +;FIELD _len, 4, 4 ; length in bytes +;FIELD _status, 4, 4 ; status enumeration +;FIELD _user_data, 8, 8 ; pointer to user data +;UNION _union, size1, align1, \ + size2, align2, \ + size3, align3, \ + ... 
+;END_FIELDS +;%assign _JOB_AES_size _FIELD_OFFSET +;%assign _JOB_AES_align _STRUCT_ALIGN + +%ifndef _DATASTRUCT_ASM_ +%define _DATASTRUCT_ASM_ + +;; START_FIELDS +%macro START_FIELDS 0 +%assign _FIELD_OFFSET 0 +%assign _STRUCT_ALIGN 0 +%endm + +;; FIELD name size align +%macro FIELD 3 +%define %%name %1 +%define %%size %2 +%define %%align %3 + +%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) +%%name equ _FIELD_OFFSET +%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) +%if (%%align > _STRUCT_ALIGN) +%assign _STRUCT_ALIGN %%align +%endif +%endm + +;; END_FIELDS +%macro END_FIELDS 0 +%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +%endm + +%endif ; end ifdef _DATASTRUCT_ASM_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h new file mode 100644 index 000000000..87d90460a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h @@ -0,0 +1,83 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef _ENDIAN_HELPER_H_ +#define _ENDIAN_HELPER_H_ + +/** + * @file endian_helper.h + * @brief Byte order helper routines + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined (__ICC) +# define byteswap32(x) _bswap(x) +# define byteswap64(x) _bswap64(x) +#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +# define byteswap32(x) __builtin_bswap32(x) +# define byteswap64(x) __builtin_bswap64(x) +#else +# define byteswap32(x) ( ((x) << 24) \ + | (((x) & 0xff00) << 8) \ + | (((x) & 0xff0000) >> 8) \ + | ((x)>>24)) +# define byteswap64(x) ( (((x) & (0xffull << 0)) << 56) \ + | (((x) & (0xffull << 8)) << 40) \ + | (((x) & (0xffull << 16)) << 24) \ + | (((x) & (0xffull << 24)) << 8) \ + | (((x) & (0xffull << 32)) >> 8) \ + | (((x) & (0xffull << 40)) >> 24) \ + | (((x) & (0xffull << 48)) >> 40) \ + | (((x) & (0xffull << 56)) >> 56)) +#endif + +// This check works when using GCC (or LLVM). Assume little-endian +// if any other compiler is being used. +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \ + && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define to_le32(x) byteswap32(x) +#define to_le64(x) byteswap64(x) +#define to_be32(x) (x) +#define to_be64(x) (x) +#else +#define to_le32(x) (x) +#define to_le64(x) (x) +#define to_be32(x) byteswap32(x) +#define to_be64(x) byteswap64(x) +#endif + +#ifdef __cplusplus +} +#endif + +#endif // _ISA_HELPER_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h new file mode 100644 index 000000000..3c7ba2877 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + +/** + * @file intrinreg.h + * @brief Defines intrinsic types used by the new hashing API + * + */ + +#ifndef _IA64_REGS_H_ +#define _IA64_REGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _MSC_VER +# define inline __inline +#endif + +#include +#include + +// Define available register types uniformly. +/// @cond +typedef struct{ uint8_t dummy; } intrinreg1; +typedef struct{ uint16_t dummy; } intrinreg2; +typedef struct{ uint32_t dummy; } intrinreg4; +typedef struct{ uint64_t dummy; } intrinreg8; +typedef __m128 intrinreg16; +/// @endcond + + +#ifdef __cplusplus +} +#endif + +#endif // _IA64_REGS_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h new file mode 100644 index 000000000..fcbae5f62 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h @@ -0,0 +1,372 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _MD5_MB_H_ +#define _MD5_MB_H_ + +/** + * @file md5_mb.h + * @brief Multi-buffer CTX API MD5 function prototypes and structures + * + * Interface for multi-buffer MD5 functions + * + * Multi-buffer MD5 Entire or First-Update..Update-Last + * + * The interface to this multi-buffer hashing code is carried out through the + * context-level (CTX) init, submit and flush functions and the MD5_HASH_CTX_MGR and + * MD5_HASH_CTX objects. Numerous MD5_HASH_CTX objects may be instantiated by the + * application for use with a single MD5_HASH_CTX_MGR. + * + * The CTX interface functions carry out the initialization and padding of the jobs + * entered by the user and add them to the multi-buffer manager. The lower level "scheduler" + * layer then processes the jobs in an out-of-order manner. The scheduler layer functions + * are internal and are not intended to be invoked directly. 
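Concretely, that flow looks like the sketch below, using the multibinary md5_ctx_mgr_* entry points and the HASH_ENTIRE whole-buffer flag described in the rest of this comment; the exact submit/flush signatures are assumed from the usual isa-l multi-buffer interface, and the helper name is illustrative:

#include <stdint.h>
#include <stdlib.h>
#include "md5_mb.h"

/* Sketch: hash a single buffer with the CTX API. Real callers keep many
 * MD5_HASH_CTX objects in flight so that several lanes are filled at once. */
static int md5_one_buffer(const void *buf, uint32_t len,
			  uint32_t digest[MD5_DIGEST_NWORDS])
{
	MD5_HASH_CTX_MGR *mgr = NULL;
	MD5_HASH_CTX ctx;
	MD5_HASH_CTX *done;

	/* The manager is typically allocated 16-byte aligned. */
	if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
		return -1;

	md5_ctx_mgr_init(mgr);
	hash_ctx_init(&ctx);                      /* macro from multi_buffer.h */

	done = md5_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
	while (done == NULL)                      /* still queued: drain the manager */
		done = md5_ctx_mgr_flush(mgr);

	for (int i = 0; i < MD5_DIGEST_NWORDS; i++)
		digest[i] = done->job.result_digest[i];

	free(mgr);
	return 0;
}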
Jobs can be submitted + * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial + * jobs which can be started using the HASH_FIRST flag, and later resumed or finished + * using the HASH_UPDATE and HASH_LAST flags respectively. + * + * Note: The submit function does not require data buffers to be block sized. + * + * The MD5 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and + * AVX512. In addition, a multibinary interface is provided, which selects the appropriate + * architecture-specific function at runtime. + * + * Usage: The application creates a MD5_HASH_CTX_MGR object and initializes it + * with a call to md5_ctx_mgr_init*() function, where henceforth "*" stands for the + * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512 (or no suffix for the + * multibinary version). The MD5_HASH_CTX_MGR object will be used to schedule processor + * resources, with up to 8 MD5_HASH_CTX objects (or 16 in AVX2 case, 32 in AVX512 case) + * being processed at a time. + * + * Each MD5_HASH_CTX must be initialized before first use by the hash_ctx_init macro + * defined in multi_buffer.h. After initialization, the application may begin computing + * a hash by giving the MD5_HASH_CTX to a MD5_HASH_CTX_MGR using the submit functions + * md5_ctx_mgr_submit*() with the HASH_FIRST flag set. When the MD5_HASH_CTX is + * returned to the application (via this or a later call to md5_ctx_mgr_submit*() or + * md5_ctx_mgr_flush*()), the application can then re-submit it with another call to + * md5_ctx_mgr_submit*(), but without the HASH_FIRST flag set. + * + * Ideally, on the last buffer for that hash, md5_ctx_mgr_submit_sse is called with + * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero + * length if necessary. When a MD5_HASH_CTX is returned after having been submitted with + * HASH_LAST, it will contain a valid hash. The MD5_HASH_CTX can be reused immediately + * by submitting with HASH_FIRST. + * + * For example, you would submit hashes with the following flags for the following numbers + * of buffers: + *
+ *
+ *  - one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ *  - two buffers: HASH_FIRST, HASH_LAST
+ *  - three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ *
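[Editor's note: a concrete sketch of the submission flow described above, not part of the upstream header. It uses only the multibinary prototypes and struct members declared later in this file plus the hash_ctx_init macro from multi_buffer.h; the job count, buffer length, and static allocation are illustrative assumptions.]

    #include <stdio.h>
    #include "md5_mb.h"

    #define NUM_JOBS 4
    #define BUF_LEN  4096

    static MD5_HASH_CTX_MGR mgr;             /* static storage keeps any declared alignment */
    static MD5_HASH_CTX ctxpool[NUM_JOBS];
    static uint8_t bufs[NUM_JOBS][BUF_LEN];  /* placeholder input data */

    int main(void)
    {
            int i;

            md5_ctx_mgr_init(&mgr);
            for (i = 0; i < NUM_JOBS; i++) {
                    hash_ctx_init(&ctxpool[i]);   /* macro from multi_buffer.h */
                    /* each buffer is a complete message, so HASH_ENTIRE is sufficient */
                    md5_ctx_mgr_submit(&mgr, &ctxpool[i], bufs[i], BUF_LEN, HASH_ENTIRE);
            }
            while (md5_ctx_mgr_flush(&mgr) != NULL)
                    ;        /* drain until every outstanding job has completed */

            for (i = 0; i < NUM_JOBS; i++) {
                    if (ctxpool[i].error)    /* non-zero on the error conditions listed below */
                            return 1;
                    printf("job %d digest word 0: %08x\n", i, ctxpool[i].job.result_digest[0]);
            }
            return 0;
    }

A multi-part hash follows the same pattern, except that the pieces are submitted with HASH_FIRST, HASH_UPDATE and HASH_LAST, and each piece may only be submitted once the context has been handed back by a previous submit or flush call.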
+ * + * The order in which MD5_CTX objects are returned is in general different from the order + * in which they are submitted. + * + * A few possible error conditions exist: + *
+ *
+ *  - Submitting flags other than the allowed entire/first/update/last values
+ *  - Submitting a context that is currently being managed by a MD5_HASH_CTX_MGR.
+ *  - Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ *
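[Editor's note: a minimal sketch, with mgr, ctx, buf and len as in the example above, of how a caller might detect a rejected submit; the mechanism is explained in the next paragraph.]

    MD5_HASH_CTX *ret = md5_ctx_mgr_submit(&mgr, &ctx, buf, len, HASH_UPDATE);
    if (ret == &ctx && ctx.error != 0) {
            /* submit was rejected (e.g. wrong flag order); the manager state is
             * untouched, so the caller can correct the flags and resubmit */
    }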
+ * + * These error conditions are reported by returning the MD5_HASH_CTX immediately after + * a submit with its error member set to a non-zero error code (defined in + * multi_buffer.h). No changes are made to the MD5_HASH_CTX_MGR in the case of an + * error; no processing is done for other hashes. + * + */ + +#include +#include "multi_buffer.h" +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Hash Constants and Typedefs +#define MD5_DIGEST_NWORDS 4 +#define MD5_MAX_LANES 32 +#define MD5_MIN_LANES 8 +#define MD5_BLOCK_SIZE 64 +#define MD5_LOG2_BLOCK_SIZE 6 +#define MD5_PADLENGTHFIELD_SIZE 8 +#define MD5_INITIAL_DIGEST \ + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 + +typedef uint32_t md5_digest_array[MD5_DIGEST_NWORDS][MD5_MAX_LANES]; +typedef uint32_t MD5_WORD_T; + +/** @brief Scheduler layer - Holds info describing a single MD5 job for the multi-buffer manager */ + +typedef struct { + uint8_t* buffer; //!< pointer to data buffer for this job + uint32_t len; //!< length of buffer for this job in blocks. + DECLARE_ALIGNED(uint32_t result_digest[MD5_DIGEST_NWORDS],64); + JOB_STS status; //!< output job status + void* user_data; //!< pointer for user's job-related data +} MD5_JOB; + +/** @brief Scheduler layer - Holds arguments for submitted MD5 job */ + +typedef struct { + md5_digest_array digest; + uint8_t* data_ptr[MD5_MAX_LANES]; +} MD5_MB_ARGS_X32; + +/** @brief Scheduler layer - Lane data */ + +typedef struct { + MD5_JOB *job_in_lane; +} MD5_LANE_DATA; + +/** @brief Scheduler layer - Holds state for multi-buffer MD5 jobs */ + +typedef struct { + MD5_MB_ARGS_X32 args; + uint32_t lens[MD5_MAX_LANES]; + uint64_t unused_lanes[4]; //!< each byte or nibble is index (0...31 or 15) of unused lanes. + MD5_LANE_DATA ldata[MD5_MAX_LANES]; + uint32_t num_lanes_inuse; +} MD5_MB_JOB_MGR; + +/** @brief Context layer - Holds state for multi-buffer MD5 jobs */ + +typedef struct { + MD5_MB_JOB_MGR mgr; +} MD5_HASH_CTX_MGR; + +/** @brief Context layer - Holds info describing a single MD5 job for the multi-buffer CTX manager */ + +typedef struct { + MD5_JOB job; // Must be at struct offset 0. + HASH_CTX_STS status; //!< Context status flag + HASH_CTX_ERROR error; //!< Context error flag + uint64_t total_length; //!< Running counter of length processed for this CTX's job + const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job + uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes. + uint8_t partial_block_buffer[MD5_BLOCK_SIZE * 2]; //!< CTX partial blocks + uint32_t partial_block_buffer_length; + void* user_data; //!< pointer for user to keep any job-related data +} MD5_HASH_CTX; + +/******************************************************************* + * CTX level API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the context level MD5 multi-buffer manager structure. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void md5_ctx_mgr_init_sse (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new MD5 job to the context level multi-buffer manager. 
+ * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_submit_sse (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted MD5 jobs and return when complete. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_flush_sse (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the MD5 multi-buffer manager structure. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns void + */ +void md5_ctx_mgr_init_avx (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new MD5 job to the multi-buffer manager. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_submit_avx (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted MD5 jobs and return when complete. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_flush_avx (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the MD5 multi-buffer manager structure. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void md5_ctx_mgr_init_avx2 (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new MD5 job to the multi-buffer manager. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_submit_avx2 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted MD5 jobs and return when complete. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_flush_avx2 (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the MD5 multi-buffer manager structure. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void md5_ctx_mgr_init_avx512 (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new MD5 job to the multi-buffer manager. 
+ * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_submit_avx512 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted MD5 jobs and return when complete. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_flush_avx512 (MD5_HASH_CTX_MGR* mgr); + +/******************** multibinary function prototypes **********************/ + +/** + * @brief Initialize the MD5 multi-buffer manager structure. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void md5_ctx_mgr_init (MD5_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new MD5 job to the multi-buffer manager. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_submit (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted MD5 jobs and return when complete. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +MD5_HASH_CTX* md5_ctx_mgr_flush (MD5_HASH_CTX_MGR* mgr); + + +/******************************************************************* + * Scheduler (internal) level out-of-order function prototypes + ******************************************************************/ + +void md5_mb_mgr_init_sse (MD5_MB_JOB_MGR *state); +MD5_JOB* md5_mb_mgr_submit_sse (MD5_MB_JOB_MGR *state, MD5_JOB* job); +MD5_JOB* md5_mb_mgr_flush_sse (MD5_MB_JOB_MGR *state); + +#define md5_mb_mgr_init_avx md5_mb_mgr_init_sse +MD5_JOB* md5_mb_mgr_submit_avx (MD5_MB_JOB_MGR *state, MD5_JOB* job); +MD5_JOB* md5_mb_mgr_flush_avx (MD5_MB_JOB_MGR *state); + +void md5_mb_mgr_init_avx2 (MD5_MB_JOB_MGR *state); +MD5_JOB* md5_mb_mgr_submit_avx2 (MD5_MB_JOB_MGR *state, MD5_JOB* job); +MD5_JOB* md5_mb_mgr_flush_avx2 (MD5_MB_JOB_MGR *state); + +void md5_mb_mgr_init_avx512 (MD5_MB_JOB_MGR *state); +MD5_JOB* md5_mb_mgr_submit_avx512 (MD5_MB_JOB_MGR *state, MD5_JOB* job); +MD5_JOB* md5_mb_mgr_flush_avx512 (MD5_MB_JOB_MGR *state); + +#ifdef __cplusplus +} +#endif + +#endif // _MD5_MB_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm new file mode 100644 index 000000000..7cb153540 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm @@ -0,0 +1,615 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifndef __MEMCPY_ASM__ +%define __MEMCPY_ASM__ + +%include "reg_sizes.asm" + + +; This file defines a series of macros to copy small to medium amounts +; of data from memory to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3 +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: pointer to src (not modified) +; SIZE : register: length in bytes (not modified) +; TMP0 : 64-bit temp GPR (clobbered) +; TMP1 : 64-bit temp GPR (clobbered) +; XTMP0 : temp XMM (clobbered) +; XTMP1 : temp XMM (clobbered) +; XTMP2 : temp XMM (clobbered) +; XTMP3 : temp XMM (clobbered) +; +; The name indicates the options. The name is of the form: +; memcpy__ +; where: +; is either "sse" or "avx" or "avx2" +; is either "64" or "128" and defines largest value of SIZE +; is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; is blank or "_ret". If blank, the code falls through. If "ret" +; it does a "ret" at the end +; +; For the avx2 versions, the temp XMM registers need to be YMM registers +; If the SZ is 64, then only two YMM temps are needed, i.e. 
it is called as: +; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1 +; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3 +; +; For example: +; memcpy_sse_64 : SSE, 0 <= size < 64, falls through +; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through +; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret +; mempcy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret +; + +%macro memcpy_sse_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0 +%endm + +%macro memcpy_sse_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0 +%endm + +%macro memcpy_sse_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0 +%endm + +%macro memcpy_sse_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0 +%endm + +%macro memcpy_sse_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0 +%endm + +%macro memcpy_sse_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0 +%endm + +%macro memcpy_sse_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0 +%endm + +%macro memcpy_sse_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0 +%endm + + +%macro memcpy_sse_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0 +%endm + +%macro memcpy_sse_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1 +%endm + +%macro memcpy_avx_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1 +%endm + +%macro memcpy_avx_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1 +%endm + +%macro memcpy_avx_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1 +%endm + +%macro memcpy_avx_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1 +%endm + +%macro memcpy_avx_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1 +%endm + +%macro memcpy_avx_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1 +%endm + +%macro memcpy_avx_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1 +%endm + + +%macro memcpy_avx_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1 +%endm + +%macro memcpy_avx_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx2_64 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2 +%endm + +%macro memcpy_avx2_64_1 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2 +%endm + +%macro memcpy_avx2_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2 +%endm + +%macro memcpy_avx2_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2 +%endm + +%macro memcpy_avx2_64_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2 +%endm + +%macro memcpy_avx2_64_1_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2 +%endm + +%macro memcpy_avx2_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 128, 1, 2 +%endm + +%macro memcpy_avx2_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 128, 1, 2 +%endm + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%macro __memcpy_int 13 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: pointer to src (not modified) +%define 
%%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP0 %4 ; 64-bit temp GPR (clobbered) +%define %%TMP1 %5 ; 64-bit temp GPR (clobbered) +%define %%XTMP0 %6 ; temp XMM (clobbered) +%define %%XTMP1 %7 ; temp XMM (clobbered) +%define %%XTMP2 %8 ; temp XMM (clobbered) +%define %%XTMP3 %9 ; temp XMM (clobbered) +%define %%NOT0 %10 ; if not 0, then assume size cannot be zero +%define %%MAXSIZE %11 ; 128, 64, etc +%define %%USERET %12 ; if not 0, use "ret" at end +%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2 + +%if (%%USERET != 0) + %define %%DONE ret +%else + %define %%DONE jmp %%end +%endif + +%if (%%USEAVX != 0) + %define %%MOVDQU vmovdqu +%else + %define %%MOVDQU movdqu +%endif + +%if (%%MAXSIZE >= 128) + test %%SIZE, 64 + jz %%lt64 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + 1*32] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32] + + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + 1*32], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + 2*16] + %%MOVDQU %%XTMP3, [%%SRC + 3*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + 2*16], %%XTMP2 + %%MOVDQU [%%DST + 3*16], %%XTMP3 + + %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 64) +%%lt64 + test %%SIZE, 32 + jz %%lt32 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32] + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 32) +%%lt32: + test %%SIZE, 16 + jz %%lt16 + %if (%%USEAVX >= 2) + %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16] + %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0) + %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1) + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 16) +%%lt16: + test %%SIZE, 8 + jz %%lt8 + mov %%TMP0, [%%SRC] + mov %%TMP1, [%%SRC + %%SIZE - 8] + mov [%%DST], %%TMP0 + mov [%%DST + %%SIZE - 8], %%TMP1 + %%DONE +%endif + +%if (%%MAXSIZE >= 8) +%%lt8: + test %%SIZE, 4 + jz %%lt4 + mov DWORD(%%TMP0), [%%SRC] + mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4] + mov [%%DST], DWORD(%%TMP0) + mov [%%DST + %%SIZE - 4], DWORD(%%TMP1) + %%DONE +%endif + +%if (%%MAXSIZE >= 4) +%%lt4: + test %%SIZE, 2 + jz %%lt2 + movzx DWORD(%%TMP0), word [%%SRC] + movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1] + mov [%%DST], WORD(%%TMP0) + mov [%%DST + %%SIZE - 1], BYTE(%%TMP1) + %%DONE +%endif + +%%lt2: +%if (%%NOT0 == 0) + test %%SIZE, 1 + jz %%end +%endif + movzx 
DWORD(%%TMP0), byte [%%SRC] + mov [%%DST], BYTE(%%TMP0) +%%end: +%if (%%USERET != 0) + ret +%endif +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Utility macro to assist with SIMD shifting +%macro _PSRLDQ 3 +%define %%VEC %1 +%define %%REG %2 +%define %%IMM %3 + +%ifidn %%VEC, SSE + psrldq %%REG, %%IMM +%else + vpsrldq %%REG, %%REG, %%IMM +%endif +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; This section defines a series of macros to store small to medium amounts +; of data from SIMD registers to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP, IDX +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: src data (clobbered) +; SIZE : register: length in bytes (not modified) +; TMP : 64-bit temp GPR (clobbered) +; IDX : 64-bit GPR to store dst index/offset (clobbered) +; +; The name indicates the options. The name is of the form: +; simd_store_ +; where is the SIMD instruction type e.g. "sse" or "avx" + + +%macro simd_store_sse 5 + __simd_store %1,%2,%3,%4,%5,SSE +%endm + +%macro simd_store_avx 5 + __simd_store %1,%2,%3,%4,%5,AVX +%endm + +%macro simd_store_sse_15 5 + __simd_store %1,%2,%3,%4,%5,SSE,15 +%endm + +%macro simd_store_avx_15 5 + __simd_store %1,%2,%3,%4,%5,AVX,15 +%endm + +%macro __simd_store 6-7 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: src data (clobbered) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP %4 ; 64-bit temp GPR (clobbered) +%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) +%define %%SIMDTYPE %6 ; "SSE" or "AVX" +%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16 + +%define %%PSRLDQ _PSRLDQ %%SIMDTYPE, + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%MOVQ movq +%else + %define %%MOVDQU vmovdqu + %define %%MOVQ vmovq +%endif + +;; determine max byte size for store operation +%if %0 > 6 +%assign max_length_to_store %%MAX_LEN +%else +%assign max_length_to_store 16 +%endif + +%if max_length_to_store > 16 +%error "__simd_store macro invoked with MAX_LEN bigger than 16!" +%endif + + xor %%IDX, %%IDX ; zero idx + +%if max_length_to_store == 16 + test %%SIZE, 16 + jz %%lt16 + %%MOVDQU [%%DST], %%SRC + jmp %%end +%%lt16: +%endif + +%if max_length_to_store >= 8 + test %%SIZE, 8 + jz %%lt8 + %%MOVQ [%%DST + %%IDX], %%SRC + %%PSRLDQ %%SRC, 8 + add %%IDX, 8 +%%lt8: +%endif + + %%MOVQ %%TMP, %%SRC ; use GPR from now on + +%if max_length_to_store >= 4 + test %%SIZE, 4 + jz %%lt4 + mov [%%DST + %%IDX], DWORD(%%TMP) + shr %%TMP, 32 + add %%IDX, 4 +%%lt4: +%endif + + test %%SIZE, 2 + jz %%lt2 + mov [%%DST + %%IDX], WORD(%%TMP) + shr %%TMP, 16 + add %%IDX, 2 +%%lt2: + test %%SIZE, 1 + jz %%end + mov [%%DST + %%IDX], BYTE(%%TMP) +%%end: +%endm + +; This section defines a series of macros to load small to medium amounts +; (from 0 to 16 bytes) of data from memory to SIMD registers, +; where the size is variable but limited. 
+; +; The macros are all called as: +; simd_load DST, SRC, SIZE +; with the parameters defined as: +; DST : register: destination XMM register +; SRC : register: pointer to src data (not modified) +; SIZE : register: length in bytes (not modified) +; +; The name indicates the options. The name is of the form: +; simd_load__ +; where: +; is either "sse" or "avx" +; is either "15" or "16" and defines largest value of SIZE +; is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; +; For example: +; simd_load_sse_16 : SSE, 0 <= size <= 16 +; simd_load_avx_15_1 : AVX, 1 <= size <= 15 + +%macro simd_load_sse_15_1 3 + __simd_load %1,%2,%3,0,0,SSE +%endm +%macro simd_load_sse_15 3 + __simd_load %1,%2,%3,1,0,SSE +%endm +%macro simd_load_sse_16_1 3 + __simd_load %1,%2,%3,0,1,SSE +%endm +%macro simd_load_sse_16 3 + __simd_load %1,%2,%3,1,1,SSE +%endm + +%macro simd_load_avx_15_1 3 + __simd_load %1,%2,%3,0,0,AVX +%endm +%macro simd_load_avx_15 3 + __simd_load %1,%2,%3,1,0,AVX +%endm +%macro simd_load_avx_16_1 3 + __simd_load %1,%2,%3,0,1,AVX +%endm +%macro simd_load_avx_16 3 + __simd_load %1,%2,%3,1,1,AVX +%endm + +%macro __simd_load 6 +%define %%DST %1 ; [out] destination XMM register +%define %%SRC %2 ; [in] pointer to src data +%define %%SIZE %3 ; [in] length in bytes (0-16 bytes) +%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0 +%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16 +%define %%SIMDTYPE %6 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%PINSRB pinsrb + %define %%PINSRQ pinsrq + %define %%PXOR pxor +%else + %define %%MOVDQU vmovdqu + %define %%PINSRB vpinsrb + %define %%PINSRQ vpinsrq + %define %%PXOR vpxor +%endif + +%if (%%ACCEPT_16 != 0) + test %%SIZE, 16 + jz %%_skip_16 + %%MOVDQU %%DST, [%%SRC] + jmp %%end_load + +%%_skip_16: +%endif + %%PXOR %%DST, %%DST ; clear XMM register +%if (%%ACCEPT_0 != 0) + or %%SIZE, %%SIZE + je %%end_load +%endif + cmp %%SIZE, 1 + je %%_size_1 + cmp %%SIZE, 2 + je %%_size_2 + cmp %%SIZE, 3 + je %%_size_3 + cmp %%SIZE, 4 + je %%_size_4 + cmp %%SIZE, 5 + je %%_size_5 + cmp %%SIZE, 6 + je %%_size_6 + cmp %%SIZE, 7 + je %%_size_7 + cmp %%SIZE, 8 + je %%_size_8 + cmp %%SIZE, 9 + je %%_size_9 + cmp %%SIZE, 10 + je %%_size_10 + cmp %%SIZE, 11 + je %%_size_11 + cmp %%SIZE, 12 + je %%_size_12 + cmp %%SIZE, 13 + je %%_size_13 + cmp %%SIZE, 14 + je %%_size_14 + +%%_size_15: + %%PINSRB %%DST, [%%SRC + 14], 14 +%%_size_14: + %%PINSRB %%DST, [%%SRC + 13], 13 +%%_size_13: + %%PINSRB %%DST, [%%SRC + 12], 12 +%%_size_12: + %%PINSRB %%DST, [%%SRC + 11], 11 +%%_size_11: + %%PINSRB %%DST, [%%SRC + 10], 10 +%%_size_10: + %%PINSRB %%DST, [%%SRC + 9], 9 +%%_size_9: + %%PINSRB %%DST, [%%SRC + 8], 8 +%%_size_8: + %%PINSRQ %%DST, [%%SRC], 0 + jmp %%end_load +%%_size_7: + %%PINSRB %%DST, [%%SRC + 6], 6 +%%_size_6: + %%PINSRB %%DST, [%%SRC + 5], 5 +%%_size_5: + %%PINSRB %%DST, [%%SRC + 4], 4 +%%_size_4: + %%PINSRB %%DST, [%%SRC + 3], 3 +%%_size_3: + %%PINSRB %%DST, [%%SRC + 2], 2 +%%_size_2: + %%PINSRB %%DST, [%%SRC + 1], 1 +%%_size_1: + %%PINSRB %%DST, [%%SRC + 0], 0 +%%end_load: +%endm + +%endif ; ifndef __MEMCPY_ASM__ diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h new file mode 100644 index 000000000..e0cc314d1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h @@ -0,0 +1,375 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights 
reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file memcpy_inline.h + * @brief Defines intrinsic memcpy functions used by the new hashing API + * + */ + +#ifndef _MEMCPY_H_ +#define _MEMCPY_H_ + +#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \ + || defined(_M_IX86) +#include "intrinreg.h" +#endif +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \ + || defined(_M_IX86) + +#define memcpy_varlen memcpy_sse_varlen +#define memcpy_fixedlen memcpy_sse_fixedlen + +#define memclr_varlen memclr_sse_varlen +#define memclr_fixedlen memclr_sse_fixedlen + +static inline void memcpy_lte32_sse_fixedlen(void* dst, const void* src, size_t nbytes); +static inline void memcpy_gte16_sse_fixedlen(void* dst, const void* src, size_t nbytes); +static inline void memcpy_sse_fixedlen (void* dst, const void* src, size_t nbytes); + +static inline void memcpy_lte32_sse_varlen (void* dst, const void* src, size_t nbytes); +static inline void memcpy_gte16_sse_varlen (void* dst, const void* src, size_t nbytes); +static inline void memcpy_sse_varlen (void* dst, const void* src, size_t nbytes); + + +static inline void memclr_lte32_sse_fixedlen(void* dst, size_t nbytes); +static inline void memclr_gte16_sse_fixedlen(void* dst, size_t nbytes); +static inline void memclr_sse_fixedlen (void* dst, size_t nbytes); + +static inline void memclr_lte32_sse_varlen (void* dst, size_t nbytes); +static inline void memclr_gte16_sse_varlen (void* dst, size_t nbytes); +static inline void memclr_sse_varlen (void* dst, size_t nbytes); + +#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \ + do { \ + intrinreg##N head; \ + intrinreg##N tail; \ + assert(N <= nbytes && nbytes <= 2*N); \ + if(N == 1 || (fixedwidth && nbytes==N) ) { \ + head = load_intrinreg##N(src); \ + store_intrinreg##N(dst, head); \ + } \ + else { \ + head = load_intrinreg##N(src); \ + tail = load_intrinreg##N((const void*)((const char*)src + (nbytes - N))); \ + 
store_intrinreg##N(dst, head); \ + store_intrinreg##N((void*)((char*)dst + (nbytes - N)), tail); \ + } \ + } while(0) + +#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \ + do { \ + const intrinreg##N zero = {0}; \ + assert(N <= nbytes && nbytes <= 2*N); \ + if(N == 1 || (fixedwidth && nbytes==N) ) { \ + store_intrinreg##N(dst, zero); \ + } \ + else { \ + store_intrinreg##N(dst, zero); \ + store_intrinreg##N((void*)((char*)dst + (nbytes - N)), zero); \ + } \ + } while(0) + +// Define load/store functions uniformly. + +#define load_intrinreg16(src) _mm_loadu_ps((const float*) src) +#define store_intrinreg16(dst,val) _mm_storeu_ps((float*) dst, val) + +static inline intrinreg8 load_intrinreg8(const void *src) +{ + return *(intrinreg8 *) src; +} + +static inline void store_intrinreg8(void *dst, intrinreg8 val) +{ + *(intrinreg8 *) dst = val; +} + +static inline intrinreg4 load_intrinreg4(const void *src) +{ + return *(intrinreg4 *) src; +} + +static inline void store_intrinreg4(void *dst, intrinreg4 val) +{ + *(intrinreg4 *) dst = val; +} + +static inline intrinreg2 load_intrinreg2(const void *src) +{ + return *(intrinreg2 *) src; +} + +static inline void store_intrinreg2(void *dst, intrinreg2 val) +{ + *(intrinreg2 *) dst = val; +} + +static inline intrinreg1 load_intrinreg1(const void *src) +{ + return *(intrinreg1 *) src; +} + +static inline void store_intrinreg1(void *dst, intrinreg1 val) +{ + *(intrinreg1 *) dst = val; +} + +static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes) +{ + size_t i; + size_t j; + intrinreg16 pool[4]; + size_t remaining_moves; + size_t tail_offset; + int do_tail; + assert(nbytes >= 16); + + for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) { + for (j = 0; j < 4; j++) + pool[j] = + load_intrinreg16((const void *)((const char *)src + i + 16 * j)); + for (j = 0; j < 4; j++) + store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]); + } + + remaining_moves = (nbytes - i) / 16; + tail_offset = nbytes - 16; + do_tail = (tail_offset & (16 - 1)); + + for (j = 0; j < remaining_moves; j++) + pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j)); + + if (do_tail) + pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset)); + + for (j = 0; j < remaining_moves; j++) + store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]); + + if (do_tail) + store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]); +} + +static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes) +{ + size_t i; + size_t j; + const intrinreg16 zero = { 0 }; + size_t remaining_moves; + size_t tail_offset; + int do_tail; + assert(nbytes >= 16); + + for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) + for (j = 0; j < 4; j++) + store_intrinreg16((void *)((char *)dst + i + 16 * j), zero); + + remaining_moves = (nbytes - i) / 16; + tail_offset = nbytes - 16; + do_tail = (tail_offset & (16 - 1)); + + for (j = 0; j < remaining_moves; j++) + store_intrinreg16((void *)((char *)dst + i + 16 * j), zero); + + if (do_tail) + store_intrinreg16((void *)((char *)dst + tail_offset), zero); +} + +static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes) +{ + assert(nbytes <= 32); + if (nbytes >= 16) + MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes); + else if (nbytes >= 8) + MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes); + else if (nbytes >= 4) + MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes); + else if (nbytes >= 2) + MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, 
src, nbytes); + else if (nbytes >= 1) + MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes); +} + +static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes) +{ + assert(nbytes <= 32); + if (nbytes >= 16) + MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes); + else if (nbytes >= 8) + MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes); + else if (nbytes >= 4) + MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes); + else if (nbytes >= 2) + MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes); + else if (nbytes >= 1) + MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes); +} + +static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes) +{ + assert(nbytes <= 32); + if (nbytes >= 16) + MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes); + else if (nbytes >= 8) + MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes); + else if (nbytes >= 4) + MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes); + else if (nbytes >= 2) + MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes); + else if (nbytes >= 1) + MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes); +} + +static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes) +{ + assert(nbytes <= 32); + if (nbytes >= 16) + MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes); + else if (nbytes >= 8) + MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes); + else if (nbytes >= 4) + MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes); + else if (nbytes >= 2) + MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes); + else if (nbytes >= 1) + MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes); +} + +static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes) +{ + size_t i = 0; + intrinreg16 tail; + + assert(nbytes >= 16); + + while (i + 128 <= nbytes) { + memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), + (const void *)((const char *)src + i), 128); + i += 128; + } + if (i + 64 <= nbytes) { + memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), + (const void *)((const char *)src + i), 64); + i += 64; + } + if (i + 32 <= nbytes) { + memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), + (const void *)((const char *)src + i), 32); + i += 32; + } + if (i + 16 <= nbytes) { + memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), + (const void *)((const char *)src + i), 16); + } + + i = nbytes - 16; + tail = load_intrinreg16((const void *)((const char *)src + i)); + store_intrinreg16((void *)((char *)dst + i), tail); +} + +static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes) +{ + size_t i = 0; + const intrinreg16 zero = { 0 }; + + assert(nbytes >= 16); + + while (i + 128 <= nbytes) { + memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128); + i += 128; + } + if (i + 64 <= nbytes) { + memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64); + i += 64; + } + if (i + 32 <= nbytes) { + memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32); + i += 32; + } + if (i + 16 <= nbytes) { + memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16); + } + + i = nbytes - 16; + store_intrinreg16((void *)((char *)dst + i), zero); +} + +static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes) +{ + if (nbytes >= 16) + memcpy_gte16_sse_fixedlen(dst, src, nbytes); + else + memcpy_lte32_sse_fixedlen(dst, src, nbytes); +} + +static inline void memclr_sse_fixedlen(void *dst, size_t nbytes) +{ + if (nbytes >= 16) + memclr_gte16_sse_fixedlen(dst, nbytes); + else + memclr_lte32_sse_fixedlen(dst, nbytes); +} + +static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes) +{ 
+ if (nbytes >= 16) + memcpy_gte16_sse_varlen(dst, src, nbytes); + else + memcpy_lte32_sse_varlen(dst, src, nbytes); +} + +static inline void memclr_sse_varlen(void *dst, size_t nbytes) +{ + if (nbytes >= 16) + memclr_gte16_sse_varlen(dst, nbytes); + else + memclr_lte32_sse_varlen(dst, nbytes); +} +#else +#define memcpy_varlen memcpy +#define memcpy_fixedlen memcpy + +#define memclr_varlen(dst,n) memset(dst,0,n) +#define memclr_fixedlen(dst,n) memset(dst,0,n) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif // __MEMCPY_H diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h new file mode 100644 index 000000000..eac3be031 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h @@ -0,0 +1,315 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _MH_SHA1_H_ +#define _MH_SHA1_H_ + +/** + * @file mh_sha1.h + * @brief mh_sha1 function prototypes and structures + * + * Interface for mh_sha1 functions + * + * mh_sha1 Init-Update..Update-Finalize + * + * This file defines the interface to optimized functions used in mh_sha1. + * The definition of multi-hash SHA1(mh_sha1, for short) is: Pad the buffer + * in SHA1 style until the total length is a multiple of 4*16*16 + * (words-width * parallel-segments * block-size); Hash the buffer in + * parallel, generating digests of 4*16*5 (words-width*parallel-segments* + * digest-size); Treat the set of digests as another data buffer, and + * generate a final SHA1 digest for it. 
+ * + * + * Example + * \code + * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; + * struct mh_sha1_ctx *ctx; + * + * ctx = malloc(sizeof(struct mh_sha1_ctx)); + * mh_sha1_init(ctx); + * mh_sha1_update(ctx, buff, block_len); + * mh_sha1_finalize(ctx, mh_sha1_digest); + * \endcode + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +// External Interface Definition +#define HASH_SEGS 16 +#define SHA1_BLOCK_SIZE 64 +#define MH_SHA1_BLOCK_SIZE (HASH_SEGS * SHA1_BLOCK_SIZE) +#define SHA1_DIGEST_WORDS 5 +#define AVX512_ALIGNED 64 + +/** @brief Holds info describing a single mh_sha1 + * + * It is better to use heap to allocate this data structure to avoid stack overflow. + * +*/ +struct mh_sha1_ctx { + uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1 + + uint64_t total_length; + //!< Parameters for update feature, describe the lengths of input buffers in bytes + uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2]; + //!< Padding the tail of input data for SHA1 + uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS]; + //!< Storing the SHA1 interim digests of all 16 segments. Each time, it will be copied to stack for 64-byte alignment purpose. + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED]; + //!< Re-structure sha1 block data from different segments to fit big endian. Use AVX512_ALIGNED for 64-byte alignment purpose. +}; + +/** + * @enum mh_sha1_ctx_error + * @brief CTX error flags + */ +enum mh_sha1_ctx_error{ + MH_SHA1_CTX_ERROR_NONE = 0, //!< MH_SHA1_CTX_ERROR_NONE + MH_SHA1_CTX_ERROR_NULL = -1, //!< MH_SHA1_CTX_ERROR_NULL +}; + + +/******************************************************************* + * mh_sha1 API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the mh_sha1_ctx structure. + * + * @param ctx Structure holding mh_sha1 info + * @returns int Return 0 if the function runs without errors + */ +int mh_sha1_init (struct mh_sha1_ctx* ctx); + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + */ +int mh_sha1_update (struct mh_sha1_ctx * ctx, const void* buffer, uint32_t len); + +/** + * @brief Finalize the message digests for multi-hash sha1. + * + * Place the message digest in mh_sha1_digest which must have enough space + * for the outputs. + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. 
+ * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + */ +int mh_sha1_finalize (struct mh_sha1_ctx* ctx, void* mh_sha1_digest); + +/******************************************************************* + * multi-types of mh_sha1 internal API + * + * XXXX The multi-binary version + * XXXX_base The C code version which used to display the algorithm + * XXXX_sse The version uses a ASM function optimized for SSE + * XXXX_avx The version uses a ASM function optimized for AVX + * XXXX_avx2 The version uses a ASM function optimized for AVX2 + * XXXX_avx512 The version uses a ASM function optimized for AVX512 + * + ******************************************************************/ + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * Base update() function that does not require SIMD support. + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_update_base (struct mh_sha1_ctx* ctx, const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires SSE + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_update_sse (struct mh_sha1_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_update_avx (struct mh_sha1_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX2 + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_update_avx2 (struct mh_sha1_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha1 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX512 + * + * @param ctx Structure holding mh_sha1 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_update_avx512 (struct mh_sha1_ctx * ctx, + const void* buffer, uint32_t len); + + +/** + * @brief Finalize the message digests for multi-hash sha1. + * + * Place the message digests in mh_sha1_digest, + * which must have enough space for the outputs. + * Base Finalize() function that does not require SIMD support. 
+ * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_finalize_base (struct mh_sha1_ctx* ctx, + void* mh_sha1_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha1_digest which must have enough space + * for the outputs. + * + * @requires SSE + * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_finalize_sse (struct mh_sha1_ctx* ctx, + void* mh_sha1_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha1_digest which must have enough space + * for the outputs. + * + * @requires AVX + * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_finalize_avx (struct mh_sha1_ctx* ctx, + void* mh_sha1_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha1_digest which must have enough space + * for the outputs. + * + * @requires AVX2 + * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_finalize_avx2 (struct mh_sha1_ctx* ctx, + void* mh_sha1_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha1_digest which must have enough space + * for the outputs. + * + * @requires AVX512 + * + * @param ctx Structure holding mh_sha1 info + * @param mh_sha1_digest The digest of mh_sha1 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha1_finalize_avx512 (struct mh_sha1_ctx* ctx, + void* mh_sha1_digest); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h new file mode 100644 index 000000000..1c07306ec --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h @@ -0,0 +1,327 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _MH_SHA1_MURMUR3_X64_128_H_ +#define _MH_SHA1_MURMUR3_X64_128_H_ + +/** + * @file mh_sha1_murmur3_x64_128.h + * @brief mh_sha1_murmur3_x64_128 function prototypes and structures + * + * Interface for mh_sha1_murmur3_x64_128 functions + * + * mh_sha1_murmur3_x64_128 Init-Update..Update-Finalize + * + * This file defines the interface to optimized functions used in mh_sha1 and + * mh_sha1_murmur3_x64_128. The definition of multi-hash SHA1(mh_sha1, + * for short) is: Pad the buffer in SHA1 style until the total length is a multiple + * of 4*16*16(words-width * parallel-segments * block-size); Hash the buffer + * in parallel, generating digests of 4*16*5 (words-width*parallel-segments* + * digest-size); Treat the set of digests as another data buffer, and generate + * a final SHA1 digest for it. mh_sha1_murmur3_x64_128 is a stitching function + * which will get a murmur3_x64_128 digest while generate mh_sha1 digest. + * + * + * Example + * \code + * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; + * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS]; + * struct mh_sha1_murmur3_x64_128_ctx *ctx; + * + * ctx = malloc(sizeof(struct mh_sha1_murmur3_x64_128_ctx)); + * mh_sha1_murmur3_x64_128_init(ctx, 0); + * mh_sha1_murmur3_x64_128_update(ctx, buff, block_len); + * mh_sha1_murmur3_x64_128_finalize(ctx, mh_sha1_digest, + * murmur_digest); + * \endcode + */ + +#include +#include "mh_sha1.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +// External Interface Definition +// Add murmur3_x64_128 definition +#define MUR_BLOCK_SIZE (2 * sizeof(uint64_t)) +#define MURMUR3_x64_128_DIGEST_WORDS 4 + +/** @brief Holds info describing a single mh_sha1_murmur3_x64_128 + * + * It is better to use heap to allocate this data structure to avoid stack overflow. + * +*/ +struct mh_sha1_murmur3_x64_128_ctx { + uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1 + uint32_t murmur3_x64_128_digest[MURMUR3_x64_128_DIGEST_WORDS]; //!< the digest of murmur3_x64_128 + + uint64_t total_length; + //!< Parameters for update feature, describe the lengths of input buffers in bytes + uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2]; + //!< Padding the tail of input data for SHA1 + uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS]; + //!< Storing the SHA1 interim digests of all 16 segments. Each time, it will be copied to stack for 64-byte alignment purpose. + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED]; + //!< Re-structure sha1 block data from different segments to fit big endian. Use AVX512_ALIGNED for 64-byte alignment purpose. +}; + +/** + * @enum mh_sha1_murmur3_ctx_error + * @brief CTX error flags + */ +enum mh_sha1_murmur3_ctx_error{ + MH_SHA1_MURMUR3_CTX_ERROR_NONE = 0, //!< MH_SHA1_MURMUR3_CTX_ERROR_NONE + MH_SHA1_MURMUR3_CTX_ERROR_NULL = -1, //! 
mh_sha256 Init-Update..Update-Finalize + * + * This file defines the interface to optimized functions used in mh_sha256. + * The definition of multi-hash SHA256(mh_sha256, for short) is: Pad the buffer + * in SHA256 style until the total length is a multiple of 4*16*16 + * (words-width * parallel-segments * block-size); Hash the buffer in + * parallel, generating digests of 4*16*8 (words-width*parallel-segments* + * digest-size); Treat the set of digests as another data buffer, and + * generate a final SHA256 digest for it. + * + * + * Example + * \code + * uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]; + * struct mh_sha256_ctx *ctx; + * + * ctx = malloc(sizeof(struct mh_sha256_ctx)); + * mh_sha256_init(ctx); + * mh_sha256_update(ctx, buff, block_len); + * mh_sha256_finalize(ctx, mh_sha256_digest); + * \endcode + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +// External Interface Definition +#define HASH_SEGS 16 +#define SHA256_BLOCK_SIZE 64 +#define MH_SHA256_BLOCK_SIZE (HASH_SEGS * SHA256_BLOCK_SIZE) +#define SHA256_DIGEST_WORDS 8 +#define AVX512_ALIGNED 64 + +/** @brief Holds info describing a single mh_sha256 + * + * It is better to use heap to allocate this data structure to avoid stack overflow. + * +*/ +struct mh_sha256_ctx { + uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]; //!< the digest of multi-hash SHA256 + + uint64_t total_length; + //!< Parameters for update feature, describe the lengths of input buffers in bytes + uint8_t partial_block_buffer [MH_SHA256_BLOCK_SIZE * 2]; + //!< Padding the tail of input data for SHA256 + uint8_t mh_sha256_interim_digests[sizeof(uint32_t) * SHA256_DIGEST_WORDS * HASH_SEGS]; + //!< Storing the SHA256 interim digests of all 16 segments. Each time, it will be copied to stack for 64-byte alignment purpose. + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE + AVX512_ALIGNED]; + //!< Re-structure sha256 block data from different segments to fit big endian. Use AVX512_ALIGNED for 64-byte alignment purpose. +}; + +/** + * @enum mh_sha256_ctx_error + * @brief CTX error flags + */ +enum mh_sha256_ctx_error{ + MH_SHA256_CTX_ERROR_NONE = 0, //!< MH_SHA256_CTX_ERROR_NONE + MH_SHA256_CTX_ERROR_NULL = -1, //!< MH_SHA256_CTX_ERROR_NULL +}; + + +/******************************************************************* + * mh_sha256 API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the mh_sha256_ctx structure. + * + * @param ctx Structure holding mh_sha256 info + * @returns int Return 0 if the function runs without errors + */ +int mh_sha256_init (struct mh_sha256_ctx* ctx); + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + */ +int mh_sha256_update (struct mh_sha256_ctx * ctx, const void* buffer, uint32_t len); + +/** + * @brief Finalize the message digests for multi-hash sha256. + * + * Place the message digest in mh_sha256_digest which must have enough space + * for the outputs. + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. 
+ * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + */ +int mh_sha256_finalize (struct mh_sha256_ctx* ctx, void* mh_sha256_digest); + +/******************************************************************* + * multi-types of mh_sha256 internal API + * + * XXXX The multi-binary version + * XXXX_base The C code version which used to display the algorithm + * XXXX_sse The version uses a ASM function optimized for SSE + * XXXX_avx The version uses a ASM function optimized for AVX + * XXXX_avx2 The version uses a ASM function optimized for AVX2 + * XXXX_avx512 The version uses a ASM function optimized for AVX512 + * + ******************************************************************/ + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * Base update() function that does not require SIMD support. + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_update_base (struct mh_sha256_ctx* ctx, const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires SSE + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_update_sse (struct mh_sha256_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_update_avx (struct mh_sha256_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX2 + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_update_avx2 (struct mh_sha256_ctx * ctx, + const void* buffer, uint32_t len); + +/** + * @brief Multi-hash sha256 update. + * + * Can be called repeatedly to update hashes with new input data. + * @requires AVX512 + * + * @param ctx Structure holding mh_sha256 info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_update_avx512 (struct mh_sha256_ctx * ctx, + const void* buffer, uint32_t len); + + +/** + * @brief Finalize the message digests for multi-hash sha256. + * + * Place the message digests in mh_sha256_digest, + * which must have enough space for the outputs. + * Base Finalize() function that does not require SIMD support. 
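+ *
+ * Illustrative sketch of forcing the portable path (no SIMD, no CPU feature
+ * detection; buf and len are assumed to come from the caller):
+ * \code
+ * uint32_t digest[SHA256_DIGEST_WORDS];
+ * struct mh_sha256_ctx *ctx = malloc(sizeof(*ctx));
+ * mh_sha256_init(ctx);
+ * mh_sha256_update_base(ctx, buf, len);
+ * mh_sha256_finalize_base(ctx, digest);
+ * free(ctx);
+ * \endcode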
+ * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_finalize_base (struct mh_sha256_ctx* ctx, + void* mh_sha256_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha256_digest which must have enough space + * for the outputs. + * + * @requires SSE + * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_finalize_sse (struct mh_sha256_ctx* ctx, + void* mh_sha256_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha256_digest which must have enough space + * for the outputs. + * + * @requires AVX + * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_finalize_avx (struct mh_sha256_ctx* ctx, + void* mh_sha256_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha256_digest which must have enough space + * for the outputs. + * + * @requires AVX2 + * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_finalize_avx2 (struct mh_sha256_ctx* ctx, + void* mh_sha256_digest); + +/** + * @brief Finalize the message digests for combined multi-hash and murmur. + * + * Place the message digest in mh_sha256_digest which must have enough space + * for the outputs. + * + * @requires AVX512 + * + * @param ctx Structure holding mh_sha256 info + * @param mh_sha256_digest The digest of mh_sha256 + * @returns int Return 0 if the function runs without errors + * + */ +int mh_sha256_finalize_avx512 (struct mh_sha256_ctx* ctx, + void* mh_sha256_digest); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h new file mode 100644 index 000000000..ac88f7b0a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h @@ -0,0 +1,112 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _MULTI_BUFFER_H_ +#define _MULTI_BUFFER_H_ + +/** + * @file multi_buffer.h + * @brief Multi-buffer common fields + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @enum JOB_STS + * @brief Job return codes + */ + +typedef enum {STS_UNKNOWN = 0, //!< STS_UNKNOWN + STS_BEING_PROCESSED = 1,//!< STS_BEING_PROCESSED + STS_COMPLETED = 2, //!< STS_COMPLETED + STS_INTERNAL_ERROR, //!< STS_INTERNAL_ERROR + STS_ERROR //!< STS_ERROR +} JOB_STS; + +#define HASH_MB_NO_FLAGS 0 +#define HASH_MB_FIRST 1 +#define HASH_MB_LAST 2 + +/* Common flags for the new API only + * */ + +/** + * @enum HASH_CTX_FLAG + * @brief CTX job type + */ +typedef enum { + HASH_UPDATE = 0x00, //!< HASH_UPDATE + HASH_FIRST = 0x01, //!< HASH_FIRST + HASH_LAST = 0x02, //!< HASH_LAST + HASH_ENTIRE = 0x03, //!< HASH_ENTIRE +} HASH_CTX_FLAG; + +/** + * @enum HASH_CTX_STS + * @brief CTX status flags + */ +typedef enum { + HASH_CTX_STS_IDLE = 0x00, //!< HASH_CTX_STS_IDLE + HASH_CTX_STS_PROCESSING = 0x01, //!< HASH_CTX_STS_PROCESSING + HASH_CTX_STS_LAST = 0x02, //!< HASH_CTX_STS_LAST + HASH_CTX_STS_COMPLETE = 0x04, //!< HASH_CTX_STS_COMPLETE +} HASH_CTX_STS; + +/** + * @enum HASH_CTX_ERROR + * @brief CTX error flags + */ +typedef enum { + HASH_CTX_ERROR_NONE = 0, //!< HASH_CTX_ERROR_NONE + HASH_CTX_ERROR_INVALID_FLAGS = -1, //!< HASH_CTX_ERROR_INVALID_FLAGS + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, //!< HASH_CTX_ERROR_ALREADY_PROCESSING + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, //!< HASH_CTX_ERROR_ALREADY_COMPLETED +} HASH_CTX_ERROR; + + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while(0) + +#ifdef __cplusplus +} +#endif + +#endif // _MULTI_BUFFER_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm new file mode 100644 index 000000000..4dd019319 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm @@ -0,0 +1,517 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _MULTIBINARY_ASM_ +%define _MULTIBINARY_ASM_ + +%ifidn __OUTPUT_FORMAT__, elf32 + %define mbin_def_ptr dd + %define mbin_ptr_sz dword + %define mbin_rdi edi + %define mbin_rsi esi + %define mbin_rax eax + %define mbin_rbx ebx + %define mbin_rcx ecx + %define mbin_rdx edx +%else + %define mbin_def_ptr dq + %define mbin_ptr_sz qword + %define mbin_rdi rdi + %define mbin_rsi rsi + %define mbin_rax rax + %define mbin_rbx rbx + %define mbin_rcx rcx + %define mbin_rdx rdx +%endif + +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + +;;;; +; multibinary macro: +; creates the visable entry point that uses HW optimized call pointer +; creates the init of the HW optimized call pointer +;;;; +%macro mbin_interface 1 + ;;;; + ; *_dispatched is defaulted to *_mbinit and replaced on first call. + ; Therefore, *_dispatch_init is only executed on first call. 
+ ;;;; + section .data + %1_dispatched: + mbin_def_ptr %1_mbinit + + section .text + mk_global %1, function + %1_mbinit: + ;;; only called the first time to setup hardware match + call %1_dispatch_init + ;;; falls thru to execute the hw optimized code + %1: + jmp mbin_ptr_sz [%1_dispatched] +%endmacro + +;;;;; +; mbin_dispatch_init parameters +; Use this function when SSE/00/01 is a minimum requirement +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +;;;;; +%macro mbin_dispatch_init 4 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init2 parameters +; Cases where only base functions are available +; 1-> function name +; 2-> base function +;;;;; +%macro mbin_dispatch_init2 2 + section .text + %1_dispatch_init: + push mbin_rsi + lea mbin_rsi, [%2 WRT_OPT] ; Default + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init5 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_1 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +;;;;; +%macro mbin_dispatch_init5 5 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + ; Test for SSE4.1 + test ecx, FLAG_CPUID1_ECX_SSE4_1 + lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func + cmovne mbin_rsi, mbin_rbx + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%3 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +%if AS_FEATURE_LEVEL >= 6 +;;;;; +; mbin_dispatch_init6 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_1 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +;;;;; +%macro mbin_dispatch_init6 6 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base 
function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_1 + je _%1_init_done ; Use base function if no SSE4_1 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +%else +%macro mbin_dispatch_init6 6 + mbin_dispatch_init5 %1, %2, %3, %4, %5 +%endmacro +%endif + +%if AS_FEATURE_LEVEL >= 10 +;;;;; +; mbin_dispatch_init7 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> AVX512 Update/10 opt func +;;;;; +%macro mbin_dispatch_init7 7 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_2 + je _%1_init_done ; Use base function if no SSE4_2 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + and ecx, FLAGS_CPUID7_ECX_AVX512_G2 + cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2 + lea mbin_rbx, [%7 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro +%else +%macro mbin_dispatch_init7 7 + mbin_dispatch_init6 %1, %2, %3, %4, %5, %6 +%endmacro +%endif + +;;;;; +; mbin_dispatch_sse_to_avx2_shani parameters +; derived from mbin_dispatch_init +; Use this function when SSE/00/01 is a minimum requirement +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +; 5-> SHANI opt for GLM +;;;;; +%macro mbin_dispatch_sse_to_avx2_shani 5 + section 
.text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_shani_check ; AVX is not available so check shani + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret + + _%1_shani_check: + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_SHA + lea mbin_rbx, [%5 WRT_OPT] ; SHANI opt func + cmovne mbin_rsi, mbin_rbx + jmp _%1_init_done ; end +%endmacro + +;;;;; +; mbin_dispatch_base_to_avx512_shani parameters +; derived from mbin_dispatch_init6 +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> SHANI opt for GLM +; 8-> SHANI opt for CNL +;;;;; +%macro mbin_dispatch_base_to_avx512_shani 8 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_2 + je _%1_init_done ; Use base function if no SSE4_2 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_shani_check + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_shani_check + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_shani_check + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + ;; Test for SHANI + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_SHA + lea mbin_rbx, [%8 WRT_OPT] ; SHANI opt sse func + cmovne mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret + + _%1_shani_check: + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_SHA + lea mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func + cmovne mbin_rsi, mbin_rbx + jmp _%1_init_done ; end +%endmacro + + + +%endif ; ifndef _MULTIBINARY_ASM_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm new file mode 100644 index 000000000..717dd0503 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm @@ -0,0 +1,442 @@ 
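The macros above implement resolve-on-first-call dispatch: the public symbol jumps through a *_dispatched pointer that initially targets *_dispatch_init, which runs the CPUID/XGETBV checks once, stores the best implementation back into the pointer and falls through to it. A rough C analogue of the same pattern is sketched below; it is illustrative only — hash(), hash_base() and hash_avx2() are hypothetical stand-ins for a dispatched API and its per-arch variants, and __builtin_cpu_supports() (a GCC/Clang builtin) stands in for the feature tests the macros perform in assembly.

/* Sketch of the mbin_interface/mbin_dispatch_init pattern in C (assumption-laden,
 * see the note above): the first call resolves the best variant, later calls jump
 * straight to it through the cached function pointer. */
#include <stddef.h>

typedef void (*hash_fn)(const void *buf, size_t len);

static void hash_base(const void *buf, size_t len) { (void)buf; (void)len; /* portable code */ }
static void hash_avx2(const void *buf, size_t len) { (void)buf; (void)len; /* AVX2 code */ }

static void hash_resolve(const void *buf, size_t len);
static hash_fn hash_dispatched = hash_resolve;        /* the *_dispatched slot */

static void hash_resolve(const void *buf, size_t len)
{
        /* runs once, like %1_dispatch_init, then falls through like %1_mbinit */
        hash_dispatched = __builtin_cpu_supports("avx2") ? hash_avx2 : hash_base;
        hash_dispatched(buf, len);
}

void hash(const void *buf, size_t len)                /* the public symbol %1 */
{
        hash_dispatched(buf, len);
}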
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + +%define EFLAGS_HAS_CPUID (1<<21) +%define FLAG_CPUID1_ECX_CLMUL (1<<1) +%define FLAG_CPUID1_EDX_SSE2 (1<<26) +%define FLAG_CPUID1_ECX_SSE3 (1) +%define FLAG_CPUID1_ECX_SSE4_1 (1<<19) +%define FLAG_CPUID1_ECX_SSE4_2 (1<<20) +%define FLAG_CPUID1_ECX_POPCNT (1<<23) +%define FLAG_CPUID1_ECX_AESNI (1<<25) +%define FLAG_CPUID1_ECX_OSXSAVE (1<<27) +%define FLAG_CPUID1_ECX_AVX (1<<28) +%define FLAG_CPUID1_EBX_AVX2 (1<<5) + +%define FLAG_CPUID7_EBX_AVX2 (1<<5) +%define FLAG_CPUID7_EBX_AVX512F (1<<16) +%define FLAG_CPUID7_EBX_AVX512DQ (1<<17) +%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21) +%define FLAG_CPUID7_EBX_AVX512PF (1<<26) +%define FLAG_CPUID7_EBX_AVX512ER (1<<27) +%define FLAG_CPUID7_EBX_AVX512CD (1<<28) +%define FLAG_CPUID7_EBX_SHA (1<<29) +%define FLAG_CPUID7_EBX_AVX512BW (1<<30) +%define FLAG_CPUID7_EBX_AVX512VL (1<<31) + +%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) +%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6) +%define FLAG_CPUID7_ECX_GFNI (1 << 8) +%define FLAG_CPUID7_ECX_VAES (1 << 9) +%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10) +%define FLAG_CPUID7_ECX_VNNI (1 << 11) +%define FLAG_CPUID7_ECX_BITALG (1 << 12) +%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14) + +%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) +%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ) + +%define FLAG_XGETBV_EAX_XMM (1<<1) +%define FLAG_XGETBV_EAX_YMM (1<<2) +%define FLAG_XGETBV_EAX_XMM_YMM 0x6 +%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0 + +%define 
FLAG_CPUID1_EAX_AVOTON 0x000406d0 +%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0 + +; define d and w variants for registers + +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define zmm0x xmm0 +%define zmm1x xmm1 +%define zmm2x xmm2 +%define zmm3x xmm3 +%define zmm4x xmm4 +%define zmm5x xmm5 +%define zmm6x xmm6 +%define zmm7x xmm7 +%define zmm8x xmm8 +%define zmm9x xmm9 +%define zmm10x xmm10 +%define zmm11x xmm11 +%define zmm12x xmm12 +%define zmm13x xmm13 +%define zmm14x xmm14 +%define zmm15x xmm15 +%define zmm16x xmm16 +%define zmm17x xmm17 +%define zmm18x xmm18 +%define zmm19x xmm19 +%define zmm20x xmm20 +%define zmm21x xmm21 +%define zmm22x xmm22 +%define zmm23x xmm23 +%define zmm24x xmm24 +%define zmm25x xmm25 +%define zmm26x xmm26 +%define zmm27x xmm27 +%define zmm28x xmm28 +%define zmm29x xmm29 +%define zmm30x xmm30 +%define zmm31x xmm31 + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 +%define ymm16x xmm16 +%define ymm17x xmm17 +%define ymm18x xmm18 +%define ymm19x xmm19 +%define ymm20x xmm20 +%define ymm21x xmm21 +%define ymm22x xmm22 +%define ymm23x xmm23 +%define ymm24x xmm24 +%define ymm25x xmm25 +%define ymm26x xmm26 +%define ymm27x xmm27 +%define ymm28x xmm28 +%define ymm29x xmm29 +%define ymm30x xmm30 +%define ymm31x xmm31 + +%define xmm0x xmm0 +%define xmm1x xmm1 +%define xmm2x xmm2 +%define xmm3x xmm3 +%define xmm4x xmm4 +%define xmm5x xmm5 +%define xmm6x xmm6 +%define xmm7x xmm7 +%define xmm8x xmm8 +%define xmm9x xmm9 +%define xmm10x xmm10 +%define xmm11x xmm11 +%define xmm12x xmm12 +%define xmm13x xmm13 +%define xmm14x xmm14 +%define xmm15x xmm15 +%define xmm16x xmm16 +%define xmm17x xmm17 +%define xmm18x xmm18 +%define xmm19x xmm19 +%define xmm20x xmm20 +%define xmm21x xmm21 +%define xmm22x xmm22 +%define xmm23x xmm23 +%define xmm24x xmm24 +%define xmm25x xmm25 +%define xmm26x xmm26 +%define xmm27x xmm27 +%define xmm28x xmm28 +%define xmm29x xmm29 +%define xmm30x xmm30 +%define xmm31x xmm31 + +%define zmm0y ymm0 +%define zmm1y ymm1 +%define zmm2y ymm2 +%define zmm3y ymm3 +%define zmm4y ymm4 +%define zmm5y ymm5 +%define zmm6y ymm6 +%define zmm7y ymm7 +%define zmm8y ymm8 +%define zmm9y ymm9 +%define zmm10y ymm10 +%define zmm11y ymm11 +%define zmm12y ymm12 +%define zmm13y ymm13 +%define zmm14y ymm14 +%define zmm15y ymm15 +%define zmm16y ymm16 +%define zmm17y ymm17 +%define zmm18y ymm18 +%define zmm19y ymm19 +%define zmm20y ymm20 +%define zmm21y ymm21 +%define zmm22y ymm22 +%define zmm23y ymm23 +%define zmm24y ymm24 +%define zmm25y ymm25 +%define zmm26y ymm26 +%define zmm27y ymm27 +%define zmm28y ymm28 +%define zmm29y ymm29 +%define zmm30y ymm30 +%define zmm31y ymm31 + +%define xmm0y ymm0 +%define xmm1y ymm1 +%define xmm2y ymm2 +%define xmm3y ymm3 +%define xmm4y ymm4 +%define xmm5y ymm5 +%define xmm6y ymm6 +%define xmm7y ymm7 +%define xmm8y ymm8 +%define xmm9y ymm9 +%define xmm10y ymm10 +%define xmm11y ymm11 +%define xmm12y ymm12 +%define xmm13y ymm13 +%define xmm14y 
ymm14 +%define xmm15y ymm15 +%define xmm16y ymm16 +%define xmm17y ymm17 +%define xmm18y ymm18 +%define xmm19y ymm19 +%define xmm20y ymm20 +%define xmm21y ymm21 +%define xmm22y ymm22 +%define xmm23y ymm23 +%define xmm24y ymm24 +%define xmm25y ymm25 +%define xmm26y ymm26 +%define xmm27y ymm27 +%define xmm28y ymm28 +%define xmm29y ymm29 +%define xmm30y ymm30 +%define xmm31y ymm31 + +%define xmm0z zmm0 +%define xmm1z zmm1 +%define xmm2z zmm2 +%define xmm3z zmm3 +%define xmm4z zmm4 +%define xmm5z zmm5 +%define xmm6z zmm6 +%define xmm7z zmm7 +%define xmm8z zmm8 +%define xmm9z zmm9 +%define xmm10z zmm10 +%define xmm11z zmm11 +%define xmm12z zmm12 +%define xmm13z zmm13 +%define xmm14z zmm14 +%define xmm15z zmm15 +%define xmm16z zmm16 +%define xmm17z zmm17 +%define xmm18z zmm18 +%define xmm19z zmm19 +%define xmm20z zmm20 +%define xmm21z zmm21 +%define xmm22z zmm22 +%define xmm23z zmm23 +%define xmm24z zmm24 +%define xmm25z zmm25 +%define xmm26z zmm26 +%define xmm27z zmm27 +%define xmm28z zmm28 +%define xmm29z zmm29 +%define xmm30z zmm30 +%define xmm31z zmm31 + +%define ymm0z zmm0 +%define ymm1z zmm1 +%define ymm2z zmm2 +%define ymm3z zmm3 +%define ymm4z zmm4 +%define ymm5z zmm5 +%define ymm6z zmm6 +%define ymm7z zmm7 +%define ymm8z zmm8 +%define ymm9z zmm9 +%define ymm10z zmm10 +%define ymm11z zmm11 +%define ymm12z zmm12 +%define ymm13z zmm13 +%define ymm14z zmm14 +%define ymm15z zmm15 +%define ymm16z zmm16 +%define ymm17z zmm17 +%define ymm18z zmm18 +%define ymm19z zmm19 +%define ymm20z zmm20 +%define ymm21z zmm21 +%define ymm22z zmm22 +%define ymm23z zmm23 +%define ymm24z zmm24 +%define ymm25z zmm25 +%define ymm26z zmm26 +%define ymm27z zmm27 +%define ymm28z zmm28 +%define ymm29z zmm29 +%define ymm30z zmm30 +%define ymm31z zmm31 + +%define DWORD(reg) reg %+ d +%define WORD(reg) reg %+ w +%define BYTE(reg) reg %+ b + +%define XWORD(reg) reg %+ x +%define YWORD(reg) reg %+ y +%define ZWORD(reg) reg %+ z + +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,elf64 + %define __x86_64__ +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,win64 + %define __x86_64__ +%endif +%ifidn __OUTPUT_FORMAT__,macho64 + %define __x86_64__ +%endif + +%ifdef __x86_64__ + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa +%else + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb +%endif + +%ifdef REL_TEXT + %define WRT_OPT +%elifidn __OUTPUT_FORMAT__, elf64 + %define WRT_OPT wrt ..plt +%else + %define WRT_OPT +%endif + +%macro mk_global 1-3 + %ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, macho64 + global %1 + %elifidn __OUTPUT_FORMAT__, win64 + global %1 + %else + global %1:%2 %3 + %endif + %else + global %1:%2 %3 + %endif +%endmacro + + +; Fixes for nasm lack of MS proc helpers +%ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, win64 + %macro alloc_stack 1 + sub rsp, %1 + %endmacro + + %macro proc_frame 1 + %1: + %endmacro + + %macro save_xmm128 2 + movdqa [rsp + %2], %1 + %endmacro + + %macro save_reg 2 + mov [rsp + %2], %1 + %endmacro + + %macro rex_push_reg 1 + push %1 + %endmacro + + %macro push_reg 1 + push %1 + %endmacro + + %define end_prolog + %endif + + %define endproc_frame +%endif + +%ifidn __OUTPUT_FORMAT__, macho64 + %define elf64 macho64 + mac_equ equ 1 +%endif + +%macro slversion 4 + section .text + global %1_slver_%2%3%4 + global %1_slver + %1_slver: + %1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro + +%endif ; ifndef _REG_SIZES_ASM_ diff --git 
a/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h new file mode 100644 index 000000000..035cf1701 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h @@ -0,0 +1,114 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/** + * @file rolling_hashx.h + * @brief Fingerprint functions based on rolling hash + * + * rolling_hash2 - checks hash in a sliding window based on random 64-bit hash. + */ + +#ifndef _ROLLING_HASHX_H_ +#define _ROLLING_HASHX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + *@brief rolling hash return values + */ +enum { + FINGERPRINT_RET_HIT = 0, //!< Fingerprint trigger hit + FINGERPRINT_RET_MAX, //!< Fingerprint max length reached before hit + FINGERPRINT_RET_OTHER //!< Fingerprint function error returned +}; + +#define FINGERPRINT_MAX_WINDOW 48 + +/** + * @brief Context for rolling_hash2 functions + */ +struct rh_state2 { + uint8_t history[FINGERPRINT_MAX_WINDOW]; + uint64_t table1[256]; + uint64_t table2[256]; + uint64_t hash; + uint32_t w; +}; + +/** + * @brief Initialize state object for rolling hash2 + * + * @param state Structure holding state info on current rolling hash + * @param w Window width (1 <= w <= 32) + * @returns 0 - success, -1 - failure + */ +int rolling_hash2_init(struct rh_state2 *state, uint32_t w); + +/** + * @brief Reset the hash state history + * + * @param state Structure holding state info on current rolling hash + * @param init_bytes Optional window size buffer to pre-init hash + * @returns none + */ +void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes); + +/** + * @brief Run rolling hash function until trigger met or max length reached + * + * Checks for trigger based on a random hash in a sliding window. 
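+ *
+ * Illustrative content-defined-chunking sketch (buffer, buf_len and trigger are
+ * hypothetical application inputs; the 8 KiB mean and 32-byte window are
+ * arbitrary choices):
+ * \code
+ * struct rh_state2 state;
+ * uint32_t offset;
+ * uint32_t mask = rolling_hashx_mask_gen(8 * 1024, 0);
+ * rolling_hash2_init(&state, 32);
+ * rolling_hash2_reset(&state, buffer);   // pre-load one window of history
+ * if (rolling_hash2_run(&state, buffer, buf_len, mask, trigger,
+ *                       &offset) == FINGERPRINT_RET_HIT)
+ *         ;                              // cut a chunk boundary at offset
+ * \endcode
+ *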
+ * @param state Structure holding state info on current rolling hash + * @param buffer Pointer to input buffer to run windowed hash on + * @param max_len Max length to run over input + * @param mask Mask bits ORed with hash before test with trigger + * @param trigger Match value to compare with windowed hash at each input byte + * @param offset Offset from buffer to match, set if match found + * @returns FINGERPRINT_RET_HIT - match found, FINGERPRINT_RET_MAX - exceeded max length + */ +int rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t max_len, + uint32_t mask, uint32_t trigger, uint32_t * offset); + +/** + * @brief Generate an appropriate mask to target mean hit rate + * + * @param mean Target chunk size in bytes + * @param shift Bits to rotate result to get independent masks + * @returns 32-bit mask value + */ +uint32_t rolling_hashx_mask_gen(long mean, int shift); + +#ifdef __cplusplus +} +#endif + +#endif // _ROLLING_HASHX_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h new file mode 100644 index 000000000..3a41684b4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h @@ -0,0 +1,450 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _SHA1_MB_H_ +#define _SHA1_MB_H_ + +/** + * @file sha1_mb.h + * @brief Multi-buffer CTX API SHA1 function prototypes and structures + * + * Interface for multi-buffer SHA1 functions + * + * Multi-buffer SHA1 Entire or First-Update..Update-Last + * + * The interface to this multi-buffer hashing code is carried out through the + * context-level (CTX) init, submit and flush functions and the SHA1_HASH_CTX_MGR and + * SHA1_HASH_CTX objects. Numerous SHA1_HASH_CTX objects may be instantiated by the + * application for use with a single SHA1_HASH_CTX_MGR. 
+ * + * The CTX interface functions carry out the initialization and padding of the jobs + * entered by the user and add them to the multi-buffer manager. The lower level "scheduler" + * layer then processes the jobs in an out-of-order manner. The scheduler layer functions + * are internal and are not intended to be invoked directly. Jobs can be submitted + * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial + * jobs which can be started using the HASH_FIRST flag, and later resumed or finished + * using the HASH_UPDATE and HASH_LAST flags respectively. + * + * Note: The submit function does not require data buffers to be block sized. + * + * The SHA1 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and + * AVX512. In addition, a multibinary interface is provided, which selects the appropriate + * architecture-specific function at runtime. + * + * Usage: The application creates a SHA1_HASH_CTX_MGR object and initializes it + * with a call to sha1_ctx_mgr_init*() function, where henceforth "*" stands for the + * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512(or no suffix for the + * multibinary version). The SHA1_HASH_CTX_MGR object will be used to schedule processor + * resources, with up to 4 SHA1_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512) + * being processed at a time. + * + * Each SHA1_HASH_CTX must be initialized before first use by the hash_ctx_init macro + * defined in multi_buffer.h. After initialization, the application may begin computing + * a hash by giving the SHA1_HASH_CTX to a SHA1_HASH_CTX_MGR using the submit functions + * sha1_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA1_HASH_CTX is + * returned to the application (via this or a later call to sha1_ctx_mgr_submit*() or + * sha1_ctx_mgr_flush*()), the application can then re-submit it with another call to + * sha1_ctx_mgr_submit*(), but without the HASH_FIRST flag set. + * + * Ideally, on the last buffer for that hash, sha1_ctx_mgr_submit_sse is called with + * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero + * length if necessary. When a SHA1_HASH_CTX is returned after having been submitted with + * HASH_LAST, it will contain a valid hash. The SHA1_HASH_CTX can be reused immediately + * by submitting with HASH_FIRST. + * + * For example, you would submit hashes with the following flags for the following numbers + * of buffers: + *
    + *
+ *  - one buffer:    HASH_FIRST | HASH_LAST  (or, equivalently, HASH_ENTIRE)
+ *  - two buffers:   HASH_FIRST, HASH_LAST
+ *  - three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
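+ *
+ * A whole-buffer sketch using the multibinary interface (illustrative; buf and
+ * len are application-provided, and the 16-byte aligned allocation of the
+ * manager is an assumption made here for the benefit of the SIMD code):
+ * \code
+ * SHA1_HASH_CTX_MGR *mgr = NULL;
+ * SHA1_HASH_CTX ctx;
+ * posix_memalign((void **)&mgr, 16, sizeof(*mgr));
+ * sha1_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctx);
+ * sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
+ * while (sha1_ctx_mgr_flush(mgr) != NULL)
+ *         ;                       // drain until all submitted jobs are done
+ * // hash_ctx_digest(&ctx) now points at the completed digest words
+ * \endcode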
+ * + * The order in which SHA1_CTX objects are returned is in general different from the order + * in which they are submitted. + * + * A few possible error conditions exist: + *
    + *
+ *  - Submitting flags other than the allowed entire/first/update/last values
+ *  - Submitting a context that is currently being managed by a SHA1_HASH_CTX_MGR.
+ *  - Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * + * These error conditions are reported by returning the SHA1_HASH_CTX immediately after + * a submit with its error member set to a non-zero error code (defined in + * multi_buffer.h). No changes are made to the SHA1_HASH_CTX_MGR in the case of an + * error; no processing is done for other hashes. + * + */ + +#include +#include "multi_buffer.h" +#include "types.h" + +#ifndef _MSC_VER +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Hash Constants and Typedefs +#define SHA1_DIGEST_NWORDS 5 +#define SHA1_MAX_LANES 16 +#define SHA1_X8_LANES 8 +#define SHA1_MIN_LANES 4 +#define SHA1_BLOCK_SIZE 64 +#define SHA1_LOG2_BLOCK_SIZE 6 +#define SHA1_PADLENGTHFIELD_SIZE 8 +#define SHA1_INITIAL_DIGEST \ + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 + +typedef uint32_t sha1_digest_array[SHA1_DIGEST_NWORDS][SHA1_MAX_LANES]; +typedef uint32_t SHA1_WORD_T; + +/** @brief Scheduler layer - Holds info describing a single SHA1 job for the multi-buffer manager */ + +typedef struct { + uint8_t* buffer; //!< pointer to data buffer for this job + uint32_t len; //!< length of buffer for this job in blocks. + DECLARE_ALIGNED(uint32_t result_digest[SHA1_DIGEST_NWORDS],64); + JOB_STS status; //!< output job status + void* user_data; //!< pointer for user's job-related data +} SHA1_JOB; + +/** @brief Scheduler layer - Holds arguments for submitted SHA1 job */ + +typedef struct { + sha1_digest_array digest; + uint8_t* data_ptr[SHA1_MAX_LANES]; +} SHA1_MB_ARGS_X16; + +/** @brief Scheduler layer - Lane data */ + +typedef struct { + SHA1_JOB *job_in_lane; +} SHA1_LANE_DATA; + +/** @brief Scheduler layer - Holds state for multi-buffer SHA1 jobs */ + +typedef struct { + SHA1_MB_ARGS_X16 args; + uint32_t lens[SHA1_MAX_LANES]; + uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7 or 0...15) of unused lanes, nibble 4 or 8 is set to F as a flag + SHA1_LANE_DATA ldata[SHA1_MAX_LANES]; + uint32_t num_lanes_inuse; +} SHA1_MB_JOB_MGR; + +/** @brief Context layer - Holds state for multi-buffer SHA1 jobs */ + +typedef struct { + SHA1_MB_JOB_MGR mgr; +} SHA1_HASH_CTX_MGR; + +/** @brief Context layer - Holds info describing a single SHA1 job for the multi-buffer CTX manager */ + +typedef struct { + SHA1_JOB job; // Must be at struct offset 0. + HASH_CTX_STS status; //!< Context status flag + HASH_CTX_ERROR error; //!< Context error flag + uint64_t total_length; //!< Running counter of length processed for this CTX's job + const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job + uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes. + uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; //!< CTX partial blocks + uint32_t partial_block_buffer_length; + void* user_data; //!< pointer for user to keep any job-related data +} SHA1_HASH_CTX; + +/******************** multibinary function prototypes **********************/ + +/** + * @brief Initialize the SHA1 multi-buffer manager structure. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the multi-buffer manager. 
+ * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush (SHA1_HASH_CTX_MGR* mgr); + + +/******************************************************************* + * Context level API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the context level SHA1 multi-buffer manager structure. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_sse (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the context level multi-buffer manager. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the context level SHA1 multi-buffer manager structure. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_sse_ni (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the context level multi-buffer manager. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse_ni (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA1 multi-buffer manager structure. 
+ * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_avx (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the multi-buffer manager. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA1 multi-buffer manager structure. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_avx2 (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the multi-buffer manager. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx2 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx2 (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA1 multi-buffer manager structure. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_avx512 (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the multi-buffer manager. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512 (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA1 multi-buffer manager structure. 
+ * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha1_ctx_mgr_init_avx512_ni (SHA1_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA1 job to the multi-buffer manager. + * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA1 jobs and return when complete. + * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512_ni (SHA1_HASH_CTX_MGR* mgr); + + +/******************************************************************* + * Scheduler (internal) level out-of-order function prototypes + ******************************************************************/ + +void sha1_mb_mgr_init_sse (SHA1_MB_JOB_MGR *state); +SHA1_JOB* sha1_mb_mgr_submit_sse (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_sse (SHA1_MB_JOB_MGR *state); + +#define sha1_mb_mgr_init_avx sha1_mb_mgr_init_sse +SHA1_JOB* sha1_mb_mgr_submit_avx (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_avx (SHA1_MB_JOB_MGR *state); + +void sha1_mb_mgr_init_avx2 (SHA1_MB_JOB_MGR *state); +SHA1_JOB* sha1_mb_mgr_submit_avx2 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_avx2 (SHA1_MB_JOB_MGR *state); + +void sha1_mb_mgr_init_avx512 (SHA1_MB_JOB_MGR *state); +SHA1_JOB* sha1_mb_mgr_submit_avx512 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_avx512 (SHA1_MB_JOB_MGR *state); + +void sha1_mb_mgr_init_sse_ni (SHA1_MB_JOB_MGR *state); +SHA1_JOB* sha1_mb_mgr_submit_sse_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_sse_ni (SHA1_MB_JOB_MGR *state); + +void sha1_mb_mgr_init_avx512_ni (SHA1_MB_JOB_MGR *state); +SHA1_JOB* sha1_mb_mgr_submit_avx512_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job); +SHA1_JOB* sha1_mb_mgr_flush_avx512_ni (SHA1_MB_JOB_MGR *state); + +#ifdef __cplusplus +} +#endif + +#endif // _SHA1_MB_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h new file mode 100644 index 000000000..8ef186b2d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h @@ -0,0 +1,451 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _SHA256_MB_H_ +#define _SHA256_MB_H_ + +/** + * @file sha256_mb.h + * @brief Multi-buffer CTX API SHA256 function prototypes and structures + * + * Interface for multi-buffer SHA256 functions + * + * Multi-buffer SHA256 Entire or First-Update..Update-Last + * + * The interface to this multi-buffer hashing code is carried out through the + * context-level (CTX) init, submit and flush functions and the SHA256_HASH_CTX_MGR and + * SHA256_HASH_CTX objects. Numerous SHA256_HASH_CTX objects may be instantiated by the + * application for use with a single SHA256_HASH_CTX_MGR. + * + * The CTX interface functions carry out the initialization and padding of the jobs + * entered by the user and add them to the multi-buffer manager. The lower level "scheduler" + * layer then processes the jobs in an out-of-order manner. The scheduler layer functions + * are internal and are not intended to be invoked directly. Jobs can be submitted + * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial + * jobs which can be started using the HASH_FIRST flag, and later resumed or finished + * using the HASH_UPDATE and HASH_LAST flags respectively. + * + * Note: The submit function does not require data buffers to be block sized. + * + * The SHA256 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and + * AVX512. In addition, a multibinary interface is provided, which selects the appropriate + * architecture-specific function at runtime. + * + * Usage: The application creates a SHA256_HASH_CTX_MGR object and initializes it + * with a call to sha256_ctx_mgr_init*() function, where henceforth "*" stands for the + * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512(or no suffix for the + * multibinary version). The SHA256_HASH_CTX_MGR object will be used to schedule processor + * resources, with up to 4 SHA256_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512) + * being processed at a time. + * + * Each SHA256_HASH_CTX must be initialized before first use by the hash_ctx_init macro + * defined in multi_buffer.h. After initialization, the application may begin computing + * a hash by giving the SHA256_HASH_CTX to a SHA256_HASH_CTX_MGR using the submit functions + * sha256_ctx_mgr_submit*() with the HASH_FIRST flag set. 
When the SHA256_HASH_CTX is + * returned to the application (via this or a later call to sha256_ctx_mgr_submit*() or + * sha256_ctx_mgr_flush*()), the application can then re-submit it with another call to + * sha256_ctx_mgr_submit*(), but without the HASH_FIRST flag set. + * + * Ideally, on the last buffer for that hash, sha256_ctx_mgr_submit_sse is called with + * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero + * length if necessary. When a SHA256_HASH_CTX is returned after having been submitted with + * HASH_LAST, it will contain a valid hash. The SHA256_HASH_CTX can be reused immediately + * by submitting with HASH_FIRST. + * + * For example, you would submit hashes with the following flags for the following numbers + * of buffers: + *
+ * <ul>
+ *  <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ *  <li> two buffers: HASH_FIRST, HASH_LAST
+ *  <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
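+ *
+ * For the two-buffer case above, a minimal sketch of the flow (assuming a single context
+ * in flight, an aligned heap allocation for the manager, and hypothetical application
+ * buffers buf1/buf2 of byte lengths len1/len2):
+ *
+ * \code
+ * SHA256_HASH_CTX_MGR *mgr = NULL;
+ * SHA256_HASH_CTX ctx, *p;
+ *
+ * posix_memalign((void **)&mgr, 16, sizeof(*mgr)); // aligned allocation (alignment assumed sufficient)
+ * sha256_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctx);                             // macro from multi_buffer.h
+ *
+ * p = sha256_ctx_mgr_submit(mgr, &ctx, buf1, len1, HASH_FIRST);
+ * while (p == NULL)                                // wait until this ctx is handed back
+ *         p = sha256_ctx_mgr_flush(mgr);
+ *
+ * p = sha256_ctx_mgr_submit(mgr, &ctx, buf2, len2, HASH_LAST);
+ * while (p == NULL)
+ *         p = sha256_ctx_mgr_flush(mgr);
+ *
+ * // If ctx.error is zero, ctx.job.result_digest[] now holds the eight SHA256 digest words.
+ * \endcode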
+ *
+ * The order in which SHA256_HASH_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ *  <li> Submitting flags other than the allowed entire/first/update/last values
+ *  <li> Submitting a context that is currently being managed by a SHA256_HASH_CTX_MGR.
+ *  <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ * + * These error conditions are reported by returning the SHA256_HASH_CTX immediately after + * a submit with its error member set to a non-zero error code (defined in + * multi_buffer.h). No changes are made to the SHA256_HASH_CTX_MGR in the case of an + * error; no processing is done for other hashes. + * + */ + +#include +#include "multi_buffer.h" +#include "types.h" + +#ifndef _MSC_VER +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Hash Constants and Typedefs +#define SHA256_DIGEST_NWORDS 8 +#define SHA256_MAX_LANES 16 +#define SHA256_X8_LANES 8 +#define SHA256_MIN_LANES 4 +#define SHA256_BLOCK_SIZE 64 +#define SHA256_LOG2_BLOCK_SIZE 6 +#define SHA256_PADLENGTHFIELD_SIZE 8 +#define SHA256_INITIAL_DIGEST \ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, \ + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +typedef uint32_t sha256_digest_array[SHA256_DIGEST_NWORDS][SHA256_MAX_LANES]; +typedef uint32_t SHA256_WORD_T; + +/** @brief Scheduler layer - Holds info describing a single SHA256 job for the multi-buffer manager */ + +typedef struct { + uint8_t* buffer; //!< pointer to data buffer for this job + uint64_t len; //!< length of buffer for this job in blocks. + DECLARE_ALIGNED(uint32_t result_digest[SHA256_DIGEST_NWORDS], 64); + JOB_STS status; //!< output job status + void* user_data; //!< pointer for user's job-related data +} SHA256_JOB; + +/** @brief Scheduler layer - Holds arguments for submitted SHA256 job */ + +typedef struct { + sha256_digest_array digest; + uint8_t* data_ptr[SHA256_MAX_LANES]; +} SHA256_MB_ARGS_X16; + +/** @brief Scheduler layer - Lane data */ + +typedef struct { + SHA256_JOB *job_in_lane; +} SHA256_LANE_DATA; + +/** @brief Scheduler layer - Holds state for multi-buffer SHA256 jobs */ + +typedef struct { + SHA256_MB_ARGS_X16 args; + uint32_t lens[SHA256_MAX_LANES]; + uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7) of unused lanes, nibble 4 or 8 is set to F as a flag + SHA256_LANE_DATA ldata[SHA256_MAX_LANES]; + uint32_t num_lanes_inuse; +} SHA256_MB_JOB_MGR; + +/** @brief Context layer - Holds state for multi-buffer SHA256 jobs */ + +typedef struct { + SHA256_MB_JOB_MGR mgr; +} SHA256_HASH_CTX_MGR; + +/** @brief Context layer - Holds info describing a single SHA256 job for the multi-buffer CTX manager */ + +typedef struct { + SHA256_JOB job; // Must be at struct offset 0. + HASH_CTX_STS status; //!< Context status flag + HASH_CTX_ERROR error; //!< Context error flag + uint64_t total_length; //!< Running counter of length processed for this CTX's job + const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job + uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes. + uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; //!< CTX partial blocks + uint32_t partial_block_buffer_length; + void* user_data; //!< pointer for user to keep any job-related data +} SHA256_HASH_CTX; + +/******************** multibinary function prototypes **********************/ + +/** + * @brief Initialize the SHA256 multi-buffer manager structure. + * @requires SSE4.1 or AVX or AVX2 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the multi-buffer manager. 
+ * @requires SSE4.1 or AVX or AVX2 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires SSE4.1 or AVX or AVX2 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush (SHA256_HASH_CTX_MGR* mgr); + + +/******************************************************************* + * CTX level API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the context level SHA256 multi-buffer manager structure. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_sse (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the context level multi-buffer manager. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the context level SHA256 multi-buffer manager structure. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_sse_ni (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the context level multi-buffer manager. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires SSE4.1 and SHANI + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse_ni (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA256 multi-buffer manager structure. 
+ * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_avx (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the multi-buffer manager. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA256 multi-buffer manager structure. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_avx2 (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the multi-buffer manager. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx2 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx2 (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA256 multi-buffer manager structure. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_avx512 (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the multi-buffer manager. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512 (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA256 multi-buffer manager structure. 
+ * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha256_ctx_mgr_init_avx512_ni (SHA256_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA256 job to the multi-buffer manager. + * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA256 jobs and return when complete. + * @requires AVX512 and SHANI + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512_ni (SHA256_HASH_CTX_MGR* mgr); + + +/******************************************************************* + * Scheduler (internal) level out-of-order function prototypes + ******************************************************************/ + +void sha256_mb_mgr_init_sse (SHA256_MB_JOB_MGR *state); +SHA256_JOB* sha256_mb_mgr_submit_sse (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_sse (SHA256_MB_JOB_MGR *state); + +#define sha256_mb_mgr_init_avx sha256_mb_mgr_init_sse +SHA256_JOB* sha256_mb_mgr_submit_avx (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_avx (SHA256_MB_JOB_MGR *state); + +void sha256_mb_mgr_init_avx2 (SHA256_MB_JOB_MGR *state); +SHA256_JOB* sha256_mb_mgr_submit_avx2 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_avx2 (SHA256_MB_JOB_MGR *state); + +void sha256_mb_mgr_init_avx512 (SHA256_MB_JOB_MGR *state); +SHA256_JOB* sha256_mb_mgr_submit_avx512 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_avx512 (SHA256_MB_JOB_MGR *state); + +void sha256_mb_mgr_init_sse_ni (SHA256_MB_JOB_MGR *state); +SHA256_JOB* sha256_mb_mgr_submit_sse_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_sse_ni (SHA256_MB_JOB_MGR *state); + +void sha256_mb_mgr_init_avx512_ni (SHA256_MB_JOB_MGR *state); +SHA256_JOB* sha256_mb_mgr_submit_avx512_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job); +SHA256_JOB* sha256_mb_mgr_flush_avx512_ni (SHA256_MB_JOB_MGR *state); + +#ifdef __cplusplus +} +#endif + +#endif // _SHA256_MB_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h new file mode 100644 index 000000000..ce3950ad1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h @@ -0,0 +1,422 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _SHA512_MB_H_ +#define _SHA512_MB_H_ + +/** + * @file sha512_mb.h + * @brief Single/Multi-buffer CTX API SHA512 function prototypes and structures + * + * Interface for single and multi-buffer SHA512 functions + * + * Single/Multi-buffer SHA512 Entire or First-Update..Update-Last + * + * The interface to this single/multi-buffer hashing code is carried out through the + * context-level (CTX) init, submit and flush functions and the SHA512_HASH_CTX_MGR and + * SHA512_HASH_CTX objects. Numerous SHA512_HASH_CTX objects may be instantiated by the + * application for use with a single SHA512_HASH_CTX_MGR. + * + * The CTX interface functions carry out the initialization and padding of the jobs + * entered by the user and add them to the multi-buffer manager. The lower level "scheduler" + * layer then processes the jobs in an out-of-order manner. The scheduler layer functions + * are internal and are not intended to be invoked directly. Jobs can be submitted + * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial + * jobs which can be started using the HASH_FIRST flag, and later resumed or finished + * using the HASH_UPDATE and HASH_LAST flags respectively. + * + * Note: The submit function does not require data buffers to be block sized. + * + * The SHA512 CTX interface functions are available for 5 architectures: multi-buffer SSE, + * AVX, AVX2, AVX512 and single-buffer SSE4 (which is used in the same way as the + * multi-buffer code). In addition, a multibinary interface is provided, which selects the + * appropriate architecture-specific function at runtime. This multibinary interface + * selects the single buffer SSE4 functions when the platform is detected to be Silvermont. + * + * Usage: The application creates a SHA512_HASH_CTX_MGR object and initializes it + * with a call to sha512_ctx_mgr_init*() function, where henceforth "*" stands for the + * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512(or no suffix for the + * multibinary version). The SHA512_HASH_CTX_MGR object will be used to schedule processor + * resources, with up to 2 SHA512_HASH_CTX objects (or 4 in the AVX2 case, 8 in the AVX512 + * case) being processed at a time. 
+ * + * Each SHA512_HASH_CTX must be initialized before first use by the hash_ctx_init macro + * defined in multi_buffer.h. After initialization, the application may begin computing + * a hash by giving the SHA512_HASH_CTX to a SHA512_HASH_CTX_MGR using the submit functions + * sha512_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA512_HASH_CTX is + * returned to the application (via this or a later call to sha512_ctx_mgr_submit*() or + * sha512_ctx_mgr_flush*()), the application can then re-submit it with another call to + * sha512_ctx_mgr_submit*(), but without the HASH_FIRST flag set. + * + * Ideally, on the last buffer for that hash, sha512_ctx_mgr_submit_sse is called with + * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero + * length if necessary. When a SHA512_HASH_CTX is returned after having been submitted with + * HASH_LAST, it will contain a valid hash. The SHA512_HASH_CTX can be reused immediately + * by submitting with HASH_FIRST. + * + * For example, you would submit hashes with the following flags for the following numbers + * of buffers: + *
+ * <ul>
+ *  <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ *  <li> two buffers: HASH_FIRST, HASH_LAST
+ *  <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
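+ *
+ * For the one-buffer case above, a minimal sketch (assuming an aligned heap allocation for
+ * the manager and a hypothetical application buffer buf of byte length len):
+ *
+ * \code
+ * SHA512_HASH_CTX_MGR *mgr = NULL;
+ * SHA512_HASH_CTX ctx, *p;
+ *
+ * posix_memalign((void **)&mgr, 16, sizeof(*mgr)); // aligned allocation (alignment assumed sufficient)
+ * sha512_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctx);                             // macro from multi_buffer.h
+ *
+ * p = sha512_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
+ * while (p == NULL)                                // only one context is in flight here
+ *         p = sha512_ctx_mgr_flush(mgr);
+ *
+ * // If ctx.error is zero, ctx.job.result_digest[] now holds the eight 64-bit SHA512 digest words.
+ * \endcode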
+ *
+ * The order in which SHA512_HASH_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ *  <li> Submitting flags other than the allowed entire/first/update/last values
+ *  <li> Submitting a context that is currently being managed by a SHA512_HASH_CTX_MGR. (Note:
+ *       This error case is not applicable to the single buffer SSE4 version)
+ *  <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ * + * These error conditions are reported by returning the SHA512_HASH_CTX immediately after + * a submit with its error member set to a non-zero error code (defined in + * multi_buffer.h). No changes are made to the SHA512_HASH_CTX_MGR in the case of an + * error; no processing is done for other hashes. + * + */ + +#include +#include "multi_buffer.h" +#include "types.h" + +#ifndef _MSC_VER +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Hash Constants and Typedefs +#define SHA512_DIGEST_NWORDS 8 +#define SHA512_MAX_LANES 8 +#define SHA512_X4_LANES 4 +#define SHA512_MIN_LANES 2 +#define SHA512_BLOCK_SIZE 128 +#define SHA512_LOG2_BLOCK_SIZE 7 +#define SHA512_PADLENGTHFIELD_SIZE 16 +#define SHA512_INITIAL_DIGEST \ + 0x6a09e667f3bcc908,0xbb67ae8584caa73b,0x3c6ef372fe94f82b,0xa54ff53a5f1d36f1, \ + 0x510e527fade682d1,0x9b05688c2b3e6c1f,0x1f83d9abfb41bd6b,0x5be0cd19137e2179 + + +typedef uint64_t sha512_digest_array[SHA512_DIGEST_NWORDS][SHA512_MAX_LANES]; +typedef uint64_t SHA512_WORD_T; + +/** @brief Scheduler layer - Holds info describing a single SHA512 job for the multi-buffer manager */ + +typedef struct { + uint8_t* buffer; //!< pointer to data buffer for this job + uint64_t len; //!< length of buffer for this job in blocks. + DECLARE_ALIGNED(uint64_t result_digest[SHA512_DIGEST_NWORDS], 64); + JOB_STS status; //!< output job status + void* user_data; //!< pointer for user's job-related data +} SHA512_JOB; + +/** @brief Scheduler layer - Holds arguments for submitted SHA512 job */ + +typedef struct { + sha512_digest_array digest; + uint8_t* data_ptr[SHA512_MAX_LANES]; +} SHA512_MB_ARGS_X8; + +/** @brief Scheduler layer - Lane data */ + +typedef struct { + SHA512_JOB *job_in_lane; +} SHA512_LANE_DATA; + +/** @brief Scheduler layer - Holds state for multi-buffer SHA512 jobs */ + +typedef struct { + SHA512_MB_ARGS_X8 args; + uint64_t lens[SHA512_MAX_LANES]; + uint64_t unused_lanes; //!< each byte is index (00, 01 or 00...03) of unused lanes, byte 2 or 4 is set to FF as a flag + SHA512_LANE_DATA ldata[SHA512_MAX_LANES]; + uint32_t num_lanes_inuse; +} SHA512_MB_JOB_MGR; + +/** @brief Context layer - Holds state for multi-buffer SHA512 jobs */ + +typedef struct { + SHA512_MB_JOB_MGR mgr; +} SHA512_HASH_CTX_MGR; + +/** @brief Context layer - Holds info describing a single SHA512 job for the multi-buffer CTX manager */ + +typedef struct { + SHA512_JOB job; // Must be at struct offset 0. + HASH_CTX_STS status; //!< Context status flag + HASH_CTX_ERROR error; //!< Context error flag + uint64_t total_length; //!< Running counter of length processed for this CTX's job + const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job + uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes. + uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; //!< CTX partial blocks + uint32_t partial_block_buffer_length; + void* user_data; //!< pointer for user to keep any job-related data +} SHA512_HASH_CTX; + +/******************************************************************* + * Context level API function prototypes + ******************************************************************/ + +/** + * @brief Initialize the context level SHA512 multi-buffer manager structure. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init_sse (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the context level multi-buffer manager. 
+ * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit_sse (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires SSE4.1 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush_sse (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA512 multi-buffer manager structure. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init_avx (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the multi-buffer manager. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires AVX + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA512 multi-buffer manager structure. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init_avx2 (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the multi-buffer manager. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx2 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires AVX2 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx2 (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA512 multi-buffer manager structure. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init_avx512 (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the multi-buffer manager. 
+ * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx512 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx512 (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Initialize the SHA512 multi-buffer manager structure. + * @requires SSE4 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init_sb_sse4 (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the multi-buffer manager. + * @requires SSE4 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit_sb_sse4 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires SSE4 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush_sb_sse4 (SHA512_HASH_CTX_MGR* mgr); + +/******************** multibinary function prototypes **********************/ + +/** + * @brief Initialize the SHA512 multi-buffer manager structure. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns void + */ +void sha512_ctx_mgr_init (SHA512_HASH_CTX_MGR* mgr); + +/** + * @brief Submit a new SHA512 job to the multi-buffer manager. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @param ctx Structure holding ctx job info + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param flags Input flag specifying job type (first, update, last or entire) + * @returns NULL if no jobs complete or pointer to jobs structure. + */ +SHA512_HASH_CTX* sha512_ctx_mgr_submit (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx, + const void* buffer, uint32_t len, HASH_CTX_FLAG flags); + +/** + * @brief Finish all submitted SHA512 jobs and return when complete. + * @requires SSE4.1 or AVX or AVX2 or AVX512 + * + * @param mgr Structure holding context level state info + * @returns NULL if no jobs to complete or pointer to jobs structure. 
+ */ +SHA512_HASH_CTX* sha512_ctx_mgr_flush (SHA512_HASH_CTX_MGR* mgr); + +/******************************************************************* + * Scheduler (internal) level out-of-order function prototypes + ******************************************************************/ + +void sha512_mb_mgr_init_sse (SHA512_MB_JOB_MGR *state); +SHA512_JOB* sha512_mb_mgr_submit_sse (SHA512_MB_JOB_MGR *state, SHA512_JOB* job); +SHA512_JOB* sha512_mb_mgr_flush_sse (SHA512_MB_JOB_MGR *state); + +#define sha512_mb_mgr_init_avx sha512_mb_mgr_init_sse +SHA512_JOB* sha512_mb_mgr_submit_avx (SHA512_MB_JOB_MGR *state, SHA512_JOB* job); +SHA512_JOB* sha512_mb_mgr_flush_avx (SHA512_MB_JOB_MGR *state); + +void sha512_mb_mgr_init_avx2 (SHA512_MB_JOB_MGR *state); +SHA512_JOB* sha512_mb_mgr_submit_avx2 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job); +SHA512_JOB* sha512_mb_mgr_flush_avx2 (SHA512_MB_JOB_MGR *state); + +void sha512_mb_mgr_init_avx512 (SHA512_MB_JOB_MGR *state); +SHA512_JOB* sha512_mb_mgr_submit_avx512 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job); +SHA512_JOB* sha512_mb_mgr_flush_avx512 (SHA512_MB_JOB_MGR *state); + +// Single buffer SHA512 APIs, optimized for SLM. +void sha512_sse4 (const void* M, void* D, uint64_t L); +// Note that these APIs comply with multi-buffer APIs' high level usage +void sha512_sb_mgr_init_sse4 (SHA512_MB_JOB_MGR *state); +SHA512_JOB* sha512_sb_mgr_submit_sse4 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job); +SHA512_JOB* sha512_sb_mgr_flush_sse4 (SHA512_MB_JOB_MGR *state); + +#ifdef __cplusplus +} +#endif + +#endif // _SHA512_MB_H_ + + diff --git a/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h new file mode 100644 index 000000000..d9e7b4eed --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h @@ -0,0 +1,155 @@ +/********************************************************************** + Copyright(c) 2011-2020 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef _SM3_MB_H_ +#define _SM3_MB_H_ + + +/** + * @file sm3_mb.h + * @brief Multi-buffer CTX API SM3 function prototypes and structures + * + * \warning Experimental interface with only base functions available at this + * time. + */ + +#include +#include "multi_buffer.h" +#include "types.h" + +#ifndef _MSC_VER +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define SM3_DIGEST_NWORDS 8 /* Word in SM3 is 32-bit */ +#define SM3_MAX_LANES 16 +#define SM3_X8_LANES 8 +#define SM3_BLOCK_SIZE 64 +#define SM3_LOG2_BLOCK_SIZE 6 +#define SM3_PADLENGTHFIELD_SIZE 8 +#define SM3_INITIAL_DIGEST \ + 0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600, \ + 0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e + +typedef uint32_t sm3_digest_array[SM3_DIGEST_NWORDS][SM3_MAX_LANES]; +typedef uint32_t SM3_WORD_T; + +/** @brief Scheduler layer - Holds info describing a single SM3 job for the multi-buffer manager */ + +typedef struct { + uint8_t *buffer; //!< pointer to data buffer for this job + uint64_t len; //!< length of buffer for this job in blocks. + DECLARE_ALIGNED(uint32_t result_digest[SM3_DIGEST_NWORDS], 64); + JOB_STS status; //!< output job status + void *user_data; //!< pointer for user's job-related data +} SM3_JOB; + +/** @brief Scheduler layer - Holds arguments for submitted SM3 job */ + +typedef struct { + sm3_digest_array digest; + uint8_t *data_ptr[SM3_MAX_LANES]; +} SM3_MB_ARGS_X16; + +/** @brief Scheduler layer - Lane data */ + +typedef struct { + SM3_JOB *job_in_lane; +} SM3_LANE_DATA; + +/** @brief Scheduler layer - Holds state for multi-buffer SM3 jobs */ + +typedef struct { + SM3_MB_ARGS_X16 args; + uint32_t lens[SM3_MAX_LANES]; + uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7) of unused lanes, nibble 4 or 8 is set to F as a flag + SM3_LANE_DATA ldata[SM3_MAX_LANES]; + uint32_t num_lanes_inuse; +} SM3_MB_JOB_MGR; + +/** @brief Context layer - Holds state for multi-buffer SM3 jobs */ + +typedef struct { + SM3_MB_JOB_MGR mgr; +} SM3_HASH_CTX_MGR; + +/** @brief Context layer - Holds info describing a single SM3 job for the multi-buffer CTX manager */ + +typedef struct { + SM3_JOB job; // Must be at struct offset 0. + HASH_CTX_STS status; //!< Context status flag + HASH_CTX_ERROR error; //!< Context error flag + uint64_t total_length; //!< Running counter of length processed for this CTX's job + const void *incoming_buffer; //!< pointer to data input buffer for this CTX's job + uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes. + uint8_t partial_block_buffer[SM3_BLOCK_SIZE * 2]; //!< CTX partial blocks + uint32_t partial_block_buffer_length; + void *user_data; //!< pointer for user to keep any job-related data +} SM3_HASH_CTX; + +/******************** multibinary function prototypes **********************/ + +/** +* @brief Initialize the SM3 multi-buffer manager structure. +* +* @param mgr Structure holding context level state info +* @returns void +*/ +void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr); + +/** +* @brief Submit a new SM3 job to the multi-buffer manager. +* +* @param mgr Structure holding context level state info +* @param ctx Structure holding ctx job info +* @param buffer Pointer to buffer to be processed +* @param len Length of buffer (in bytes) to be processed +* @param flags Input flag specifying job type (first, update, last or entire) +* @returns NULL if no jobs complete or pointer to jobs structure. 
+*/ +SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags); + +/** +* @brief Finish all submitted SM3 jobs and return when complete. +* +* @param mgr Structure holding context level state info +* @returns NULL if no jobs to complete or pointer to jobs structure. +*/ +SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/include/test.h b/src/crypto/isa-l/isa-l_crypto/include/test.h new file mode 100644 index 000000000..7b99390b8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/test.h @@ -0,0 +1,111 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#ifndef _TEST_H +#define _TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "endian_helper.h" + +// Use sys/time.h functions for time +#if defined (__unix__) || (__APPLE__) || (__MINGW32__) +# include +#endif + +#ifdef _MSC_VER +# define inline __inline +# include +# include +#endif + +#include +#include + +struct perf{ + struct timeval tv; +}; + + +#if defined (__unix__) || (__APPLE__) || (__MINGW32__) +static inline int perf_start(struct perf *p) +{ + return gettimeofday(&(p->tv), 0); +} +static inline int perf_stop(struct perf *p) +{ + return gettimeofday(&(p->tv), 0); +} + +static inline void perf_print(struct perf stop, struct perf start, long long dsize) +{ + long long secs = stop.tv.tv_sec - start.tv.tv_sec; + long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec; + + printf("runtime = %10lld usecs", usecs); + if (dsize != 0) { +#if 1 // not bug in printf for 32-bit + printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024), + ((double) usecs)/1000000, ((double) dsize) / (double)usecs); +#else + printf(", bandwidth %lld MB ", dsize/(1024*1024)); + printf("in %.4f sec ",(double)usecs/1000000); + printf("= %.2f MB/s\n", (double)dsize/usecs); +#endif + } + else + printf("\n"); +} +#endif + +static inline uint64_t get_filesize(FILE *fp) +{ + uint64_t file_size; + fpos_t pos, pos_curr; + + fgetpos(fp, &pos_curr); /* Save current position */ +#if defined(_WIN32) || defined(_WIN64) + _fseeki64(fp, 0, SEEK_END); +#else + fseeko(fp, 0, SEEK_END); +#endif + fgetpos(fp, &pos); + file_size = *(uint64_t *)&pos; + fsetpos(fp, &pos_curr); /* Restore position */ + + return file_size; +} + +#ifdef __cplusplus +} +#endif + +#endif // _TEST_H diff --git a/src/crypto/isa-l/isa-l_crypto/include/types.h b/src/crypto/isa-l/isa-l_crypto/include/types.h new file mode 100644 index 000000000..de452557a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/include/types.h @@ -0,0 +1,100 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +/** + * @file types.h + * @brief Defines common align and debug macros + * + */ + +#ifndef __TYPES_H +#define __TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#if defined __unix__ || defined __APPLE__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define __forceinline static inline +# define aligned_free(x) free(x) +#else +# ifdef __MINGW32__ +# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval))) +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# else +# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl +# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn))) +# define aligned_free(x) _aligned_free(x) +# endif +#endif + +#ifdef DEBUG +# define DEBUG_PRINT(x) printf x +#else +# define DEBUG_PRINT(x) do {} while (0) +#endif + + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif +#ifndef __has_extension +# define __has_extension __has_feature +#endif +#define ISAL_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + +#if (defined(__ICC) || defined( __GNUC__ ) || defined(__clang__)) && !defined(ISAL_UNIT_TEST) +# if __has_extension(attribute_deprecated_with_message) \ + || (ISAL_GCC_VERSION >= 40500) \ + || (__INTEL_COMPILER >= 1100) +# define ISAL_DEPRECATED(message) __attribute__(( deprecated( message ))) +# else +# define ISAL_DEPRECATED(message) __attribute__(( deprecated )) +# endif +#elif (defined( __ICL ) || defined(_MSC_VER)) +# if (__INTEL_COMPILER >= 1100) || (_MSC_FULL_VER >= 140050727) +# define ISAL_DEPRECATED(message) __declspec( deprecated ( message )) +# else +# define ISAL_DEPRECATED(message) __declspec( deprecated ) +# endif +#else +# define ISAL_DEPRECATED(message) +#endif + +#define ISAL_EXPERIMENTAL(message) ISAL_DEPRECATED("Experimental: " message) + +#ifdef __cplusplus +} +#endif + +#endif //__TYPES_H diff --git a/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def new file mode 100644 index 000000000..de38b6d19 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def @@ -0,0 +1,80 @@ +LIBRARY isa-l_crypto +VERSION 2.24 +EXPORTS + +sha1_ctx_mgr_init @1 +sha1_ctx_mgr_submit @2 +sha1_ctx_mgr_flush @3 +sha256_ctx_mgr_init @4 +sha256_ctx_mgr_submit @5 +sha256_ctx_mgr_flush @6 +sha512_ctx_mgr_init @7 +sha512_ctx_mgr_submit @8 +sha512_ctx_mgr_flush @9 +md5_ctx_mgr_init @10 +md5_ctx_mgr_submit @11 +md5_ctx_mgr_flush @12 +mh_sha1_init @13 +mh_sha1_update @14 +mh_sha1_finalize @15 +mh_sha1_finalize_base @16 +mh_sha1_update_base @17 +mh_sha1_murmur3_x64_128_init @18 +mh_sha1_murmur3_x64_128_finalize_base @19 +mh_sha1_murmur3_x64_128_update_base @20 +mh_sha1_murmur3_x64_128_update @21 +mh_sha1_murmur3_x64_128_finalize @22 +aes_keyexp_128 @23 +aes_keyexp_192 @24 
+aes_keyexp_256 @25 +aes_cbc_enc_128 @26 +aes_cbc_dec_128 @27 +aes_cbc_enc_192 @28 +aes_cbc_dec_192 @29 +aes_cbc_enc_256 @30 +aes_cbc_dec_256 @31 +aes_cbc_precomp @32 +XTS_AES_128_enc @33 +XTS_AES_128_enc_expanded_key @34 +XTS_AES_128_dec @35 +XTS_AES_128_dec_expanded_key @36 +XTS_AES_256_enc @37 +XTS_AES_256_enc_expanded_key @38 +XTS_AES_256_dec @39 +XTS_AES_256_dec_expanded_key @40 +mh_sha256_init @41 +mh_sha256_update @42 +mh_sha256_finalize @43 +mh_sha256_finalize_base @44 +mh_sha256_update_base @45 +rolling_hashx_mask_gen @46 +rolling_hash2_run @47 +rolling_hash2_reset @48 +rolling_hash2_init @49 +aes_gcm_pre_128 @50 +aes_gcm_enc_128 @51 +aes_gcm_dec_128 @52 +aes_gcm_init_128 @53 +aes_gcm_enc_128_update @54 +aes_gcm_dec_128_update @55 +aes_gcm_enc_128_finalize @56 +aes_gcm_dec_128_finalize @57 +aes_gcm_pre_256 @58 +aes_gcm_enc_256 @59 +aes_gcm_dec_256 @60 +aes_gcm_init_256 @61 +aes_gcm_enc_256_update @62 +aes_gcm_dec_256_update @63 +aes_gcm_enc_256_finalize @64 +aes_gcm_dec_256_finalize @65 +aes_gcm_enc_128_nt @66 +aes_gcm_dec_128_nt @67 +aes_gcm_enc_128_update_nt @68 +aes_gcm_dec_128_update_nt @69 +aes_gcm_enc_256_nt @70 +aes_gcm_dec_256_nt @71 +aes_gcm_enc_256_update_nt @72 +aes_gcm_dec_256_update_nt @73 +sm3_ctx_mgr_init @74 +sm3_ctx_mgr_submit @75 +sm3_ctx_mgr_flush @76 diff --git a/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in new file mode 100644 index 000000000..41ba8d5a3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libisal_crypto +Description: Crypto library for storage systems +Version: @VERSION@ +Libs: -L${libdir} -lisal_crypto +Libs.private: +Cflags: -I${includedir} diff --git a/src/crypto/isa-l/isa-l_crypto/make.inc b/src/crypto/isa-l/isa-l_crypto/make.inc new file mode 100644 index 000000000..0cb94d12d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/make.inc @@ -0,0 +1,340 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + + +# Makefile include for optimized libraries +# make targets: +# lib - build library of optimized functions +# slib - build shared library +# test - run unit tests of functions +# perf - run performance tests +# install - install headers and libs to system location +# sim - run on simulator +# trace - get simulator trace +# clean - remove object files + +version ?= 2.24.0 + + + +CC = gcc +AS = nasm +AWK = awk + +DEBUG = -g +DEBUG_yasm = -g dwarf2 +DEBUG_nasm = -g + +# Default arch= build options +CFLAGS_ = -Wall +ASFLAGS_ = -f elf64 +ARFLAGS_ = cr $@ +STRIP_gcc = strip -d -R .comment $@ + +# arch=32 build options +ASFLAGS_32 = -f elf32 +CFLAGS_32 = -m32 +ARFLAGS_32 = cr $@ + +# arch=win64 build options +ASFLAGS_win64 = -f win64 +CFLAGS_icl = -Qstd=c99 +ARFLAGS_win64 = -out:$@ + +# arch=mingw build options +ASFLAGS_mingw = -f win64 +ARFLAGS_mingw = cr $@ + +LDFLAGS_so = -Wl,-soname,$(soname) + +ifeq ($(arch),mingw) + CC=x86_64-w64-mingw32-gcc + AR=x86_64-w64-mingw32-ar + LDFLAGS += -Wl,--force-exe-suffix + SIM=wine + EXT=.exe + CLEANFILES+=*.exe +endif + +ASFLAGS_Darwin = -f macho64 --prefix=_ +ARFLAGS_Darwin = -r $@ +ifeq ($(shell uname),Darwin) + LDFLAGS_so = + STRIP_gcc = +endif + +# arch=aarch64 build options +ifeq ($(lib_debug),1) + ASFLAGS_aarch64 = -g -c +else + ASFLAGS_aarch64 = -c +endif + +ARFLAGS_aarch64 = cr $@ +ifeq ($(arch),aarch64) + AS=$(CC) -D__ASSEMBLY__ + SIM= +endif +# arch=noarch build options +ARFLAGS_noarch = cr $@ +CFLAGS_noarch= -DNOARCH +ifeq ($(arch),noarch) + host_cpu=base_aliases +endif +INCLUDE = $(patsubst %,-I%/,$(subst :, ,$(VPATH))) +CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE) +ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE) +ARFLAGS = $(ARFLAGS_$(arch)) +DEFINES += $(addprefix -D , $D) +CLEANFILES += $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name) + +ifeq ($(filter aarch64 x86_%,$(host_cpu)),) + host_cpu=base_aliases +endif +lsrc += $(lsrc_$(host_cpu)) +O = bin +lobj += $(patsubst %.c,%.o,$(patsubst %.S,%.o,$(patsubst %.asm,%.o,$(lsrc) $(lsrc_intrinsic)))) +objs = $(addprefix $(O)/,$(notdir $(lobj))) + + +lib_name ?= isa-l_crypto.a +default: lib slib + +# Defaults for windows build +ifeq ($(arch),win64) + AR=lib + CC=cl + OUTPUT_OPTION = -Fo$@ + DEBUG= + lib_name := $(basename $(lib_name)).lib +endif +lsrcwin64 = $(lsrc) +unit_testswin64 = $(unit_tests) +exampleswin64 = $(examples) +perf_testswin64 = $(perf_tests) + + +# Build and run unit tests, performance tests, etc. 
+all_tests = $(notdir $(sort $(perf_tests) $(check_tests) $(unit_tests) $(examples) $(other_tests))) +all_unit_tests = $(notdir $(sort $(check_tests) $(unit_tests))) +all_perf_tests = $(notdir $(sort $(perf_tests))) +all_check_tests = $(notdir $(sort $(check_tests))) + +$(all_unit_tests): % : %.c $(lib_name) +$(all_perf_tests): % : %.c $(lib_name) +$(sort $(notdir $(examples))): % : %.c $(lib_name) +$(sort $(notdir $(other_tests))): % : %.c $(lib_name) + +# Check for modern as +test-as = $(shell hash printf && printf $(3) > $(2) && $(AS) $(ASFLAGS) ${tmpf} -o /dev/null 2> /dev/null && echo $(1) || echo $(4)) +as_4 := "pblendvb xmm2, xmm1;" +as_6 := "vinserti32x8 zmm0, ymm1, 1;" +as_10 := "vpcompressb zmm0 {k1}, zmm1;" + +tmpf := $(shell mktemp) +as_feature_level := $(call test-as, 4, $(tmpf), $(as_4), $(as_feature_level)) +as_feature_level := $(call test-as, 6, $(tmpf), $(as_6), $(as_feature_level)) +as_feature_level := $(call test-as, 10, $(tmpf), $(as_10), $(as_feature_level)) +tmpf := $(shell rm ${tmpf}) + +ifneq ($(findstring $(as_feature_level),6 10),) + D_HAVE_AS_KNOWS_AVX512_y := -DHAVE_AS_KNOWS_AVX512 +endif + +CFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y) +ASFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y) + +sim test trace: $(addsuffix .run,$(all_unit_tests)) +perf: $(addsuffix .run,$(all_perf_tests)) +check: $(addsuffix .run,$(all_check_tests)) +ex: $(notdir $(examples)) +all: lib $(all_tests) +other: $(notdir $(other_tests)) +tests: $(all_unit_tests) +perfs: $(all_perf_tests) +checks: $(all_check_tests) +trace: SIM=sde -debugtrace -- +sim: SIM=sde -- +check test sim: + @echo Finished running $@ + +$(objs): | $(O) +$(O): ; mkdir -p $(O) + +# Build rule to run tests +$(addsuffix .run,$(all_tests)): %.run : % + $(SIM) ./$<$(EXT) + @echo Completed run: $< + +# Other build rules +msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(host_cpu)) $D + +# gcc assembly files +$(O)/%.o: $(host_cpu)/%.S + @echo " ---> Building $< $(msg)" + @$(AS) $(ASFLAGS) -o $@ $< + +$(O)/%.o : $(host_cpu)/%.c + @echo " ---> Building $< $(msg)" + @$(COMPILE.c) $(OUTPUT_OPTION) $< +# yasm/nasm assembly files +$(O)/%.o: %.asm + @echo " ---> Building $< $(msg)" + @$(AS) $(ASFLAGS) -o $@ $< + + +$(O)/%.o %.o: %.c + @echo " ---> Building $< $(msg)" + @$(COMPILE.c) $(OUTPUT_OPTION) $< + +$(all_tests): + @echo " ---> Building Test $@ $(msg)" + @$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@ + + +# Target to build lib files +lib: $(lib_name) +ifneq ($(lib_debug),1) + $(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib + $(lib_name): DEBUG= + $(lib_name): DEFINES+=-D NDEBUG +endif +ifeq ($(lib_debug),1) + DEBUG+=-D DEBUG # Define DEBUG for macros +endif + +#lib $(lib_name): $(lib_name)(${objs}) +$(lib_name): $(objs) + @echo " ---> Creating Lib $@" + @$(AR) $(ARFLAGS) $^ +ifneq ($(lib_debug),1) + @$(STRIP_$(CC)) +endif + + +# Target for shared lib +so_lib_name = bin/libisal_crypto.so +so_lib_inst = $(notdir $(so_lib_name)) +so_lib_ver = $(so_lib_inst).$(version) +soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version))) + +slib: $(so_lib_name) +aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc) $(lsrc_intrinsic))))) +aobjs += $(addprefix $(O)/,$(patsubst %.S,%.o,$(filter %.S,$(notdir $(lsrc) $(lsrc_intrinsic))))) +shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc) $(lsrc_intrinsic))))) + +$(O)/shared_ver_%.o: %.c + @echo " ---> Building shared $< $(msg)" + @$(COMPILE.c) $(OUTPUT_OPTION) 
$< + +$(O)/shared_ver_%.o: $(host_cpu)/%.c + @echo " ---> Building shared $< $(msg)" + @$(COMPILE.c) $(OUTPUT_OPTION) $< +ifneq ($(lib_debug),1) + $(so_lib_name): DEBUG_$(AS)= + $(so_lib_name): DEBUG= + $(so_lib_name): DEFINES+=-D NDEBUG +endif + +$(shared_objs): CFLAGS += -fPIC +$(shared_objs) $(aobjs): | $(O) +$(so_lib_name): LDFLAGS+=$(LDFLAGS_so) +$(so_lib_name): $(shared_objs) $(aobjs) + @echo " ---> Creating Shared Lib $@" + @$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^ + @(cd $(@D); ln -f -s $(so_lib_inst) $(soname)) + +isa-l_crypto.h: + @echo 'Building $@' + @echo '' >> $@ + @echo '/**' >> $@ + @echo ' * @file isa-l_crypto.h'>> $@ + @echo ' * @brief Include for ISA-L_crypto library' >> $@ + @echo ' */' >> $@ + @echo '' >> $@ + @echo '#ifndef _ISAL_CRYPTO_H_' >> $@ + @echo '#define _ISAL_CRYPTO_H_' >> $@ + @echo '' >> $@ + @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@ + @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@ + @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@ + @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@ + @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@ + @echo '' >> $@ + @for unit in $(sort $(extern_hdrs)); do echo "#include " | sed -e 's;include/;;' >> $@; done + @echo '#endif //_ISAL_CRYPTO_H_' >> $@ + + +# Target for install +prefix = /usr/local +install_dirs = $(prefix)/lib $(prefix)/include/isa-l_crypto +$(install_dirs): ; mkdir -p $@ +install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l_crypto.h + install -m 644 $(lib_name) $(prefix)/lib/libisal_crypto.a + install -m 644 $^ $(prefix)/include/isa-l_crypto/. + install -m 664 isa-l_crypto.h $(prefix)/include/. + install -m 664 include/types.h $(prefix)/include/isa-l_crypto/. + install -m 664 include/endian_helper.h $(prefix)/include/isa-l_crypto/. + install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver) + (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst)) +ifeq ($(shell uname),Darwin) + (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib) + which glibtool && glibtool --mode=finish $(prefix)/lib +else + which libtool && libtool --mode=finish $(prefix)/lib || \ + echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.' 
+endif + +uninstall: + $(RM) $(prefix)/lib/libisal_crypto.a + $(RM) $(prefix)/lib/$(soname) + $(RM) $(prefix)/lib/$(so_lib_ver) + $(RM) $(prefix)/lib/$(so_lib_inst) + $(RM) -r $(prefix)/include/isa-l_crypto + $(RM) $(prefix)/include/isa-l_crypto.h + $(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib + +# Collect performance data +rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf + +perf_report: + echo Results for $(rpt_name) >> $(rpt_name) + $(MAKE) -f Makefile.unx -k perf | tee -a $(rpt_name) + @echo Summary: + -grep runtime $(rpt_name) + + +clean: + @echo Cleaning up + @$(RM) -r $(CLEANFILES) + +doc: isa-l_crypto.h + (cat Doxyfile; echo 'PROJECT_NUMBER=$(version)') | doxygen - + $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log + cp generated_doc/latex/refman.pdf isa-l_crypto_api_$(version).pdf diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am new file mode 100644 index 000000000..423f12945 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am @@ -0,0 +1,98 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_x86_64 += md5_mb/md5_ctx_sse.c \ + md5_mb/md5_ctx_avx.c \ + md5_mb/md5_ctx_avx2.c \ + md5_mb/md5_ctx_base.c + +lsrc_x86_64 += md5_mb/md5_mb_mgr_init_sse.c \ + md5_mb/md5_mb_mgr_init_avx2.c \ + md5_mb/md5_mb_mgr_init_avx512.c + +lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_sse.asm \ + md5_mb/md5_mb_mgr_submit_avx.asm \ + md5_mb/md5_mb_mgr_submit_avx2.asm \ + md5_mb/md5_mb_mgr_flush_sse.asm \ + md5_mb/md5_mb_mgr_flush_avx.asm \ + md5_mb/md5_mb_mgr_flush_avx2.asm \ + md5_mb/md5_mb_x4x2_sse.asm \ + md5_mb/md5_mb_x4x2_avx.asm \ + md5_mb/md5_mb_x8x2_avx2.asm \ + md5_mb/md5_multibinary.asm + +lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_avx512.asm \ + md5_mb/md5_mb_mgr_flush_avx512.asm \ + md5_mb/md5_mb_x16x2_avx512.asm \ + md5_mb/md5_ctx_avx512.c + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += md5_mb/md5_ctx_base.c \ + md5_mb/aarch64/md5_ctx_aarch64_asimd.c \ + md5_mb/aarch64/md5_mb_aarch64_dispatcher.c \ + md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c \ + md5_mb/aarch64/md5_mb_asimd_x4.S \ + md5_mb/aarch64/md5_mb_asimd_x1.S \ + md5_mb/aarch64/md5_mb_multibinary.S + + +lsrc_base_aliases += md5_mb/md5_ctx_base.c \ + md5_mb/md5_ctx_base_aliases.c +src_include += -I $(srcdir)/md5_mb +extern_hdrs += include/md5_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + md5_mb/md5_job.asm \ + md5_mb/md5_mb_mgr_datastruct.asm \ + md5_mb/md5_ref.c \ + include/reg_sizes.asm \ + include/multibinary.asm \ + include/memcpy_inline.h \ + include/intrinreg.h + +check_tests += md5_mb/md5_mb_test \ + md5_mb/md5_mb_rand_test \ + md5_mb/md5_mb_rand_update_test + +unit_tests += md5_mb/md5_mb_rand_ssl_test + +perf_tests += md5_mb/md5_mb_vs_ossl_perf + + +md5_mb_rand_test: md5_ref.o +md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la +md5_mb_rand_update_test: md5_ref.o +md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la +md5_mb_rand_ssl_test: LDLIBS += -lcrypto +md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto +md5_mb_vs_ossl_perf: LDLIBS += -lcrypto +md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto + diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c new file mode 100644 index 000000000..e9a708c17 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c @@ -0,0 +1,230 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include "md5_mb.h" +#include "memcpy_inline.h" +void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state); +MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job); +MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state); + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_asimd(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_asimd(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_asimd(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
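+ // For example, with MD5_BLOCK_SIZE = 64: submitting len = 100 while 10
+ // bytes are already buffered copies copy_len = 64 - 10 = 54 bytes above,
+ // leaving incoming_buffer_length = 46 and a full 64-byte extra block,
+ // which is handed to the multi-buffer manager below as a 1-block job.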
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_asimd(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_asimd(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
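+ // hash_pad() (defined at the end of this file) appends the 0x80
+ // terminator, zero fill and the 64-bit message length in bits, so
+ // n_extra_blocks below is always 1 or 2.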
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..14ef3a6e6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c @@ -0,0 +1,59 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(md5_ctx_mgr_submit_asimd); + + return PROVIDER_BASIC(md5_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(md5_ctx_mgr_init_asimd); + + return PROVIDER_BASIC(md5_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(md5_ctx_mgr_flush_asimd); + + return PROVIDER_BASIC(md5_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S new file mode 100644 index 000000000..27d112494 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S @@ -0,0 +1,248 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + + +.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_c,\d_d + mov k,\kl + and tmp0,tmp0,\d_b + movk k,\kh,lsl 16 + eor tmp0,tmp0,\d_d + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_b,\d_c + mov k,\kl + and tmp0,tmp0,\d_d + movk k,\kh,lsl 16 + eor tmp0,tmp0,\d_c + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_b,\d_c + mov k,\kl + eor tmp0,tmp0,\d_d + movk k,\kh,lsl 16 + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + orn tmp0,\d_b,\d_d + mov k,\kl + eor tmp0,tmp0,\d_c + movk k,\kh,lsl 16 + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm +/* + variables +*/ + job0 .req x0 + digest_addr .req x0 + len .req w1 + end .req x1 + + buf_adr .req x2 + d_a .req w3 + d_b .req w4 + d_c .req w5 + d_d .req w6 + k .req w7 + m0 .req w8 + m1 .req w9 + m2 .req w10 + m3 .req w11 + m4 .req w12 + m5 .req w13 + m6 .req w14 + m7 .req w15 + m8 .req w19 + m9 .req w20 + m10 .req w21 + m11 .req w22 + m12 .req w23 + m13 .req w24 + m14 .req w25 + m15 .req w26 + + tmp0 .req w27 + tmp1 .req w28 + + d_a1 .req w8 + d_b1 .req w9 + d_c1 .req w15 + d_d1 .req w19 + +/* + void md5_mb_asimd_x1(MD5_JOB * job0,int len) +*/ + .global md5_mb_asimd_x1 + .type md5_mb_asimd_x1, %function +md5_mb_asimd_x1: + cmp len,0 + stp x29, x30, [sp,-96]! 
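+ // Scalar one-lane MD5: the prologue saves x19-x28 in the 96-byte frame,
+ // end is computed as buffer + len * 64, and the four digest words are
+ // carried in d_a..d_d across the 64 rounds of each block.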
+ ldr buf_adr,[job0],64 + stp x19, x20, [sp, 16] + add end,buf_adr,end,lsl 6 + stp x21, x22, [sp, 32] + ldp d_a,d_b,[digest_addr] + stp x23, x24, [sp, 48] + ldp d_c,d_d,[digest_addr,8] + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + ble .exit + +.loop_start: + ldp m0,m1,[buf_adr],8 + ldp m2,m3,[buf_adr],8 + round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7 + + ldp m4,m5,[buf_adr],8 + round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12 + ldp m6,m7,[buf_adr],8 + round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17 + ldp m8,m9,[buf_adr],8 + round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22 + ldp m10,m11,[buf_adr],8 + round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7 + ldp m12,m13,[buf_adr],8 + round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12 + ldp m14,m15,[buf_adr],8 + round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17 + round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22 + round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7 + round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12 + round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17 + round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22 + round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7 + round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12 + round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17 + round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22 + + round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5 + round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9 + round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14 + round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20 + round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5 + round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9 + round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14 + round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20 + round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5 + round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9 + round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14 + round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20 + round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5 + round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9 + round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14 + round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20 + + round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4 + round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11 + round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16 + round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23 + round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4 + round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11 + round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16 + round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23 + round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4 + round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11 + round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16 + round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23 + round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4 + round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11 + round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16 + round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23 + + round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6 + round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10 + round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15 + round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21 + round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6 + round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10 + round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15 + round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21 + round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6 + round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10 + round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15 + round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21 + round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6 + ldp d_a1,d_b1,[digest_addr] + round_48_63 
d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10 + ldp d_c1,d_d1,[digest_addr,8] + round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15 + round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21 + + cmp buf_adr,end + add d_a,d_a1 ,d_a + str d_a,[digest_addr] + add d_b,d_b1 ,d_b + str d_b,[digest_addr,4] + add d_c,d_c1 ,d_c + str d_c,[digest_addr,8] + add d_d,d_d1 ,d_d + str d_d,[digest_addr,12] + bne .loop_start + +.exit: + ldp x19, x20, [sp, 16] + ldp x21, x22, [sp, 32] + ldp x23, x24, [sp, 48] + ldp x25, x26, [sp, 64] + ldp x27, x28, [sp, 80] + ldp x29, x30, [sp], 96 + ret + .size md5_mb_asimd_x1, .-md5_mb_asimd_x1 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S new file mode 100644 index 000000000..53979131d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S @@ -0,0 +1,526 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + +.macro add_key_rol a:req,b:req,k:req,w:req,r:req + add v_tmp0.4s,v_\k\().4s,v_\w\().4s + add v_tmp1.4s,v_tmp1.4s,v_\a\().4s + add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s + shl v_tmp0.4s,v_tmp1.4s,\r + ushr v_tmp1.4s,v_tmp1.4s,32-\r + orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b + + add v_\a\().4s,v_\b\().4s,v_tmp0.4s +.endm +.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + mov v_tmp1.16b, v_\b\().16b + bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + mov v_tmp1.16b, v_\d\().16b + bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + eor v_tmp1.16b,v_\b\().16b,v_\c\().16b + eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req + orn v_tmp1.16b,v_\b\().16b,v_\d\().16b + eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b + .ifnb \k1 + ldr q_\k1,[key_adr],16 + .endif + add_key_rol \a,\b,\k,\w,\r +.endm +/* + variables +*/ + declare_var_vector_reg tmp0, 0 + declare_var_vector_reg tmp1, 1 + declare_var_vector_reg k, 2 + declare_var_vector_reg k1, 3 + declare_var_vector_reg a, 4 + declare_var_vector_reg b, 5 + declare_var_vector_reg c, 6 + declare_var_vector_reg d, 7 + declare_var_vector_reg a1, 8 + declare_var_vector_reg b1, 9 + declare_var_vector_reg c1, 10 + declare_var_vector_reg d1, 11 + + declare_var_vector_reg w0, 16 + declare_var_vector_reg w1, 17 + declare_var_vector_reg w2, 18 + declare_var_vector_reg w3, 19 + declare_var_vector_reg w4, 20 + declare_var_vector_reg w5, 21 + declare_var_vector_reg w6, 22 + declare_var_vector_reg w7, 23 + declare_var_vector_reg w8, 24 + declare_var_vector_reg w9, 25 + declare_var_vector_reg w10, 26 + declare_var_vector_reg w11, 27 + declare_var_vector_reg w12, 28 + declare_var_vector_reg w13, 29 + declare_var_vector_reg w14, 30 + declare_var_vector_reg w15, 31 + + len .req w4 + len_x .req x4 + lane0 .req x5 + lane1 .req x6 + lane2 .req x7 + lane3 .req x9 + end .req x4 + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + key_adr .req x10 + +/* + void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1, + MD5_JOB * job2, MD5_JOB * job3, int len) +*/ + .global md5_mb_asimd_x4 + .type md5_mb_asimd_x4, %function +md5_mb_asimd_x4: + stp x29,x30,[sp,-48]! 
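+ // Four-lane MD5: lane0..lane3 walk the four job buffers in lock-step
+ // while v_a..v_d hold the packed digests, one job per 32-bit element.
+ // Only v8-v11 of the callee-saved SIMD registers are used, so just
+ // d8-d11 are spilled here.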
+ ldr lane0,[job0],64 + stp d8,d9,[sp,16] + ldr lane1,[job1],64 + stp d10,d11,[sp,32] + ldr lane2,[job2],64 + cmp len,0 + ldr lane3,[job3],64 + ble .exit + + //load digests + ld4 {v_a.s-v_d.s}[0],[job0] + add end,lane0,len_x,lsl 6 + ld4 {v_a.s-v_d.s}[1],[job1] + ld4 {v_a.s-v_d.s}[2],[job2] + ld4 {v_a.s-v_d.s}[3],[job3] +.loop_start: + ld1 {v_w0.s}[0],[lane0],4 + mov v_a1.16b,v_a.16b + ld1 {v_w0.s}[1],[lane1],4 + mov v_b1.16b,v_b.16b + ld1 {v_w0.s}[2],[lane2],4 + mov v_c1.16b,v_c.16b + ld1 {v_w0.s}[3],[lane3],4 + mov v_d1.16b,v_d.16b + + ld3 {v_w1.s-v_w3.s}[0],[lane0],12 + adrp key_adr,.key_consts + ld3 {v_w1.s-v_w3.s}[1],[lane1],12 + add key_adr,key_adr,#:lo12:.key_consts + ld3 {v_w1.s-v_w3.s}[2],[lane2],12 + ldr q_k,[key_adr],16 + ld3 {v_w1.s-v_w3.s}[3],[lane3],12 + + + ld4 {v_w4.s-v_w7.s}[0], [lane0],16 + + round_0_15 a,b,c,d,k,k1,w0,7 + + ld4 {v_w4.s-v_w7.s}[1], [lane1],16 + round_0_15 d,a,b,c,k1,k,w1,12 + ld4 {v_w4.s-v_w7.s}[2], [lane2],16 + round_0_15 c,d,a,b,k,k1,w2,17 + ld4 {v_w4.s-v_w7.s}[3], [lane3],16 + round_0_15 b,c,d,a,k1,k,w3,22 + ld4 {v_w8.s-v_w11.s}[0],[lane0],16 + round_0_15 a,b,c,d,k,k1,w4,7 + ld4 {v_w8.s-v_w11.s}[1],[lane1],16 + round_0_15 d,a,b,c,k1,k,w5,12 + ld4 {v_w8.s-v_w11.s}[2],[lane2],16 + round_0_15 c,d,a,b,k,k1,w6,17 + ld4 {v_w8.s-v_w11.s}[3],[lane3],16 + round_0_15 b,c,d,a,k1,k,w7,22 + ld4 {v_w12.s-v_w15.s}[0],[lane0],16 + round_0_15 a,b,c,d,k,k1,w8,7 + ld4 {v_w12.s-v_w15.s}[1],[lane1],16 + round_0_15 d,a,b,c,k1,k,w9,12 + ld4 {v_w12.s-v_w15.s}[2],[lane2],16 + round_0_15 c,d,a,b,k,k1,w10,17 + ld4 {v_w12.s-v_w15.s}[3],[lane3],16 + round_0_15 b,c,d,a,k1,k,w11,22 + round_0_15 a,b,c,d,k,k1,w12,7 + round_0_15 d,a,b,c,k1,k,w13,12 + round_0_15 c,d,a,b,k,k1,w14,17 + round_0_15 b,c,d,a,k1,k,w15,22 + + round_16_31 a,b,c,d,k,k1,w1,5 + round_16_31 d,a,b,c,k1,k,w6,9 + round_16_31 c,d,a,b,k,k1,w11,14 + round_16_31 b,c,d,a,k1,k,w0,20 + round_16_31 a,b,c,d,k,k1,w5,5 + round_16_31 d,a,b,c,k1,k,w10,9 + round_16_31 c,d,a,b,k,k1,w15,14 + round_16_31 b,c,d,a,k1,k,w4,20 + round_16_31 a,b,c,d,k,k1,w9,5 + round_16_31 d,a,b,c,k1,k,w14,9 + round_16_31 c,d,a,b,k,k1,w3,14 + round_16_31 b,c,d,a,k1,k,w8,20 + round_16_31 a,b,c,d,k,k1,w13,5 + round_16_31 d,a,b,c,k1,k,w2,9 + round_16_31 c,d,a,b,k,k1,w7,14 + round_16_31 b,c,d,a,k1,k,w12,20 + + round_32_47 a,b,c,d,k,k1,w5,4 + round_32_47 d,a,b,c,k1,k,w8,11 + round_32_47 c,d,a,b,k,k1,w11,16 + round_32_47 b,c,d,a,k1,k,w14,23 + round_32_47 a,b,c,d,k,k1,w1,4 + round_32_47 d,a,b,c,k1,k,w4,11 + round_32_47 c,d,a,b,k,k1,w7,16 + round_32_47 b,c,d,a,k1,k,w10,23 + round_32_47 a,b,c,d,k,k1,w13,4 + round_32_47 d,a,b,c,k1,k,w0,11 + round_32_47 c,d,a,b,k,k1,w3,16 + round_32_47 b,c,d,a,k1,k,w6,23 + round_32_47 a,b,c,d,k,k1,w9,4 + round_32_47 d,a,b,c,k1,k,w12,11 + round_32_47 c,d,a,b,k,k1,w15,16 + round_32_47 b,c,d,a,k1,k,w2,23 + + round_48_63 a,b,c,d,k,k1,w0,6 + round_48_63 d,a,b,c,k1,k,w7,10 + round_48_63 c,d,a,b,k,k1,w14,15 + round_48_63 b,c,d,a,k1,k,w5,21 + round_48_63 a,b,c,d,k,k1,w12,6 + round_48_63 d,a,b,c,k1,k,w3,10 + round_48_63 c,d,a,b,k,k1,w10,15 + round_48_63 b,c,d,a,k1,k,w1,21 + round_48_63 a,b,c,d,k,k1,w8,6 + round_48_63 d,a,b,c,k1,k,w15,10 + round_48_63 c,d,a,b,k,k1,w6,15 + round_48_63 b,c,d,a,k1,k,w13,21 + round_48_63 a,b,c,d,k,k1,w4,6 + round_48_63 d,a,b,c,k1,k,w11,10 + round_48_63 c,d,a,b,k,k1,w2,15 + round_48_63 b,c,d,a,k1, ,w9,21 + + + + + cmp lane0,end + add v_a.4s,v_a1.4s,v_a.4s + add v_b.4s,v_b1.4s,v_b.4s + add v_c.4s,v_c1.4s,v_c.4s + add v_d.4s,v_d1.4s,v_d.4s + bne .loop_start + + st4 {v_a.s-v_d.s}[0],[job0] + st4 {v_a.s-v_d.s}[1],[job1] 
+ st4 {v_a.s-v_d.s}[2],[job2] + st4 {v_a.s-v_d.s}[3],[job3] +.exit: + ldp d8,d9,[sp,16] + ldp d10,d11,[sp,32] + ldp x29,x30,[sp],48 + ret +.key_consts: + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0x242070db + .word 0x242070db + .word 0x242070db + .word 0x242070db + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xf57c0faf + .word 0xf57c0faf + .word 0xf57c0faf + .word 0xf57c0faf + .word 0x4787c62a + .word 0x4787c62a + .word 0x4787c62a + .word 0x4787c62a + .word 0xa8304613 + .word 0xa8304613 + .word 0xa8304613 + .word 0xa8304613 + .word 0xfd469501 + .word 0xfd469501 + .word 0xfd469501 + .word 0xfd469501 + .word 0x698098d8 + .word 0x698098d8 + .word 0x698098d8 + .word 0x698098d8 + .word 0x8b44f7af + .word 0x8b44f7af + .word 0x8b44f7af + .word 0x8b44f7af + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0x895cd7be + .word 0x895cd7be + .word 0x895cd7be + .word 0x895cd7be + .word 0x6b901122 + .word 0x6b901122 + .word 0x6b901122 + .word 0x6b901122 + .word 0xfd987193 + .word 0xfd987193 + .word 0xfd987193 + .word 0xfd987193 + .word 0xa679438e + .word 0xa679438e + .word 0xa679438e + .word 0xa679438e + .word 0x49b40821 + .word 0x49b40821 + .word 0x49b40821 + .word 0x49b40821 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xc040b340 + .word 0xc040b340 + .word 0xc040b340 + .word 0xc040b340 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xd62f105d + .word 0xd62f105d + .word 0xd62f105d + .word 0xd62f105d + .word 0x02441453 + .word 0x02441453 + .word 0x02441453 + .word 0x02441453 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0x455a14ed + .word 0x455a14ed + .word 0x455a14ed + .word 0x455a14ed + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0xfffa3942 + .word 0xfffa3942 + .word 0xfffa3942 + .word 0xfffa3942 + .word 0x8771f681 + .word 0x8771f681 + .word 0x8771f681 + .word 0x8771f681 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0xfde5380c + .word 0xfde5380c + .word 0xfde5380c + .word 0xfde5380c + .word 0xa4beea44 + .word 0xa4beea44 + .word 0xa4beea44 + .word 0xa4beea44 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0x04881d05 + .word 0x04881d05 + .word 0x04881d05 + .word 0x04881d05 + .word 0xd9d4d039 + .word 0xd9d4d039 + .word 0xd9d4d039 + 
.word 0xd9d4d039 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xf4292244 + .word 0xf4292244 + .word 0xf4292244 + .word 0xf4292244 + .word 0x432aff97 + .word 0x432aff97 + .word 0x432aff97 + .word 0x432aff97 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0xffeff47d + .word 0xffeff47d + .word 0xffeff47d + .word 0xffeff47d + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xa3014314 + .word 0xa3014314 + .word 0xa3014314 + .word 0xa3014314 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0xeb86d391 + .word 0xeb86d391 + .word 0xeb86d391 + .word 0xeb86d391 + .size md5_mb_asimd_x4, .-md5_mb_asimd_x4 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c new file mode 100644 index 000000000..5289cd91f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c @@ -0,0 +1,187 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include +#include +#include + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +#define MD5_MB_CE_MAX_LANES 4 +void md5_mb_asimd_x4(MD5_JOB *, MD5_JOB *, MD5_JOB *, MD5_JOB *, int); +void md5_mb_asimd_x1(MD5_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes[0] = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) { + state->unused_lanes[0] <<= 4; + state->unused_lanes[0] |= MD5_MB_CE_MAX_LANES - 1 - i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > MD5_MB_CE_MAX_LANES is invalid lane + for (; i < MD5_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int md5_mb_mgr_do_jobs(MD5_MB_JOB_MGR * state) +{ + int lane_idx, len, i; + + if (state->num_lanes_inuse == 0) { + return -1; + } + if (state->num_lanes_inuse == 4) { + len = min(min(state->lens[0], state->lens[1]), + min(state->lens[2], state->lens[3])); + lane_idx = len & 0xf; + len &= ~0xf; + md5_mb_asimd_x4(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, + state->ldata[3].job_in_lane, len >> 4); + //only return the min length job + for (i = 0; i < MD5_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + } else { + for (i = 0; i < MD5_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + len = state->lens[i] & (~0xf); + md5_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4); + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + return i; + } + } + } + return -1; + +} + +static MD5_JOB *md5_mb_mgr_free_lane(MD5_MB_JOB_MGR * state) +{ + int i; + MD5_JOB *ret = NULL; + + for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes[0] <<= 4; + state->unused_lanes[0] |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void md5_mb_mgr_insert_job(MD5_MB_JOB_MGR * state, MD5_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes[0] & 0xf; + //fatal error + assert(lane_idx < MD5_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes[0] >>= 4; + state->num_lanes_inuse++; +} + +MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + MD5_JOB *ret; + + //add job into lanes + md5_mb_mgr_insert_job(state, job); + + ret = md5_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < MD5_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + 
lane_idx = md5_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + md5_mb_mgr_do_jobs(state); +#endif + + ret = md5_mb_mgr_free_lane(state); + return ret; +} + +MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state) +{ + MD5_JOB *ret; + ret = md5_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + md5_mb_mgr_do_jobs(state); + return md5_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S new file mode 100644 index 000000000..b66320f5c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface md5_ctx_mgr_submit +mbin_interface md5_ctx_mgr_init +mbin_interface md5_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c new file mode 100644 index 000000000..ac03a6705 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c @@ -0,0 +1,263 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
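+ // The bitwise '|' in the test below is intentional: either a non-empty
+ // partial block or a short user buffer (len < MD5_BLOCK_SIZE) selects
+ // the buffering path, presumably to avoid a second conditional branch.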
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1)); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx_slver_02020183; +struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 }; + +struct slver md5_ctx_mgr_submit_avx_slver_02020184; +struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 }; + +struct slver md5_ctx_mgr_flush_avx_slver_02020185; +struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c new file mode 100644 index 000000000..cdc910c0d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c @@ -0,0 +1,263 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx2(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
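+	// (At most MD5_BLOCK_SIZE bytes are ever staged here, and job.len counts
+	//  blocks, so a filled extra block is submitted with len = 1 while the
+	//  incoming buffer pointer and length are advanced past the copied bytes.
+	//  Worked example: with 20 bytes already buffered and a 100-byte submit,
+	//  44 bytes are copied to complete one block and that block is submitted;
+	//  the remaining 56 bytes stay recorded in incoming_buffer and are picked
+	//  up by md5_ctx_mgr_resubmit() once this context comes back from the
+	//  lane manager.)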
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
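+	// (Typical caller pattern, sketched here as a rough guide only:
+	//      md5_ctx_mgr_init_avx2(mgr);
+	//      hash_ctx_init(&ctx);                  // helper from multi_buffer.h
+	//      md5_ctx_mgr_submit_avx2(mgr, &ctx, buf, len, HASH_ENTIRE);
+	//      while (md5_ctx_mgr_flush_avx2(mgr) != NULL)
+	//              ;                             // drain lanes that never filled
+	//  submit() can also hand back completed jobs directly once enough lanes
+	//  are busy, so callers should check its return value as well.)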
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1)); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx2_slver_04020186; +struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 }; + +struct slver md5_ctx_mgr_submit_avx2_slver_04020187; +struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 }; + +struct slver md5_ctx_mgr_flush_avx2_slver_04020188; +struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c new file mode 100644 index 000000000..682c2ed5e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c @@ -0,0 +1,267 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx512(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
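+	// (The final submit below tags the context as PROCESSING|COMPLETE so that,
+	//  when the padded block finishes, the check at the top of this loop strips
+	//  the PROCESSING bit and the completed job is handed back to the caller.)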
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1)); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx512_slver_0600018c; +struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 }; + +struct slver md5_ctx_mgr_submit_avx512_slver_0600018d; +struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 }; + +struct slver md5_ctx_mgr_flush_avx512_slver_0600018e; +struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c new file mode 100644 index 000000000..c1d2a2738 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c @@ -0,0 +1,291 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "md5_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (c ^ (d & (b ^ c))) +#define F3(b,c,d) (b ^ c ^ d) +#define F4(b,c,d) (c ^ (b | ~d)) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + +#define step(i,a,b,c,d,f,k,w,r) \ + if (i < 16) {f = F1(b,c,d); } else \ + if (i < 32) {f = F2(b,c,d); } else \ + if (i < 48) {f = F3(b,c,d); } else \ + {f = F4(b,c,d); } \ + f = a + f + k + to_le32(w); \ + a = b + rol32(f, r); + +static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX md5_single(const void *data, uint32_t digest[4]); +static inline void hash_init_digest(MD5_WORD_T * digest); + +void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr) +{ +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + + md5_init(ctx, buffer, len); + md5_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + md5_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = md5_update(ctx, buffer, len); + md5_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + md5_init(ctx, buffer, len); + remain_len = md5_update(ctx, buffer, len); + md5_final(ctx, remain_len); + } + + return ctx; +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + while (remain_len >= 64) { + md5_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + 64); + remain_len -= 64; + ctx->total_length += 64; + } + + ctx->status = HASH_CTX_STS_IDLE; + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len, j; + uint8_t buf[128]; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy(buf, buffer, i); + buf[i++] = 0x80; + for (j = i; j < 120; j++) + buf[j] = 0; + + if (i > 64 - 8) + i = 128; + else + i = 64; + + *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) ctx->total_length * 8); + + md5_single(buf, digest); + if (i == 128) { + md5_single(buf + 64, digest); + } + + ctx->status = HASH_CTX_STS_COMPLETE; +} + +static void md5_single(const void *data, uint32_t digest[4]) +{ + + uint32_t a, b, c, d; + uint32_t f; + uint32_t *w = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + + step(0, a, b, c, d, f, 0xd76aa478, w[0], 7); + step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12); + step(2, c, d, a, b, f, 0x242070db, w[2], 17); + step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22); + step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7); + step(5, d, a, b, c, f, 0x4787c62a, w[5], 12); + step(6, c, d, a, b, f, 0xa8304613, w[6], 17); + step(7, b, c, d, a, f, 0xfd469501, w[7], 22); + step(8, a, b, c, d, f, 0x698098d8, w[8], 7); + step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12); + step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17); + step(11, b, c, d, a, f, 0x895cd7be, w[11], 22); + step(12, a, b, c, d, f, 0x6b901122, w[12], 7); + step(13, d, a, b, c, f, 0xfd987193, w[13], 12); + step(14, c, d, a, b, f, 0xa679438e, w[14], 17); + step(15, b, c, d, a, f, 0x49b40821, w[15], 22); + + step(16, a, b, c, d, f, 0xf61e2562, w[1], 5); + step(17, d, a, b, c, f, 0xc040b340, w[6], 9); + step(18, c, d, a, b, f, 0x265e5a51, w[11], 14); + step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20); + step(20, a, b, c, d, f, 0xd62f105d, w[5], 5); + step(21, d, a, b, c, f, 0x02441453, w[10], 9); + step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14); + step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20); + step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5); + step(25, d, a, b, c, f, 0xc33707d6, w[14], 9); + step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14); + 
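+	// (still in rounds 16..31: auxiliary function F2, message word order
+	//  (1 + 5*i) mod 16, rotations 5/9/14/20; the two groups below switch to
+	//  F3 and F4 with their own message orders and rotation sets)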
step(27, b, c, d, a, f, 0x455a14ed, w[8], 20); + step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5); + step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9); + step(30, c, d, a, b, f, 0x676f02d9, w[7], 14); + step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20); + + step(32, a, b, c, d, f, 0xfffa3942, w[5], 4); + step(33, d, a, b, c, f, 0x8771f681, w[8], 11); + step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16); + step(35, b, c, d, a, f, 0xfde5380c, w[14], 23); + step(36, a, b, c, d, f, 0xa4beea44, w[1], 4); + step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11); + step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16); + step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23); + step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4); + step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11); + step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16); + step(43, b, c, d, a, f, 0x04881d05, w[6], 23); + step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4); + step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11); + step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16); + step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23); + + step(48, a, b, c, d, f, 0xf4292244, w[0], 6); + step(49, d, a, b, c, f, 0x432aff97, w[7], 10); + step(50, c, d, a, b, f, 0xab9423a7, w[14], 15); + step(51, b, c, d, a, f, 0xfc93a039, w[5], 21); + step(52, a, b, c, d, f, 0x655b59c3, w[12], 6); + step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10); + step(54, c, d, a, b, f, 0xffeff47d, w[10], 15); + step(55, b, c, d, a, f, 0x85845dd1, w[1], 21); + step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6); + step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10); + step(58, c, d, a, b, f, 0xa3014314, w[6], 15); + step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21); + step(60, a, b, c, d, f, 0xf7537e82, w[4], 6); + step(61, d, a, b, c, f, 0xbd3af235, w[11], 10); + step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15); + step(63, b, c, d, a, f, 0xeb86d391, w[9], 21); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_base_slver_0000018f; +struct slver md5_ctx_mgr_init_base_slver = { 0x018f, 0x00, 0x00 }; + +struct slver md5_ctx_mgr_submit_base_slver_00000190; +struct slver md5_ctx_mgr_submit_base_slver = { 0x0190, 0x00, 0x00 }; + +struct slver md5_ctx_mgr_flush_base_slver_00000191; +struct slver md5_ctx_mgr_flush_base_slver = { 0x0191, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c new file mode 100644 index 000000000..42e29ab5a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c @@ -0,0 +1,50 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include "md5_mb.h" +extern void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr); +extern MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr); +extern MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags); +void md5_ctx_mgr_init(MD5_HASH_CTX_MGR * mgr) +{ + md5_ctx_mgr_init_base(mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush(MD5_HASH_CTX_MGR * mgr) +{ + return md5_ctx_mgr_flush_base(mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return md5_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c new file mode 100644 index 000000000..1e7e91916 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c @@ -0,0 +1,249 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_sse(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
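+	// (The assert above guarantees at most one block is ever buffered, so the
+	//  >= test below only fires when the extra block is exactly full.)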
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
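+	// (A job that is neither complete nor flagged LAST is parked below in the
+	//  IDLE state until the caller submits more data with HASH_UPDATE or
+	//  finishes it with HASH_LAST.)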
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1)); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_sse_slver_00020180; +struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 }; + +struct slver md5_ctx_mgr_submit_sse_slver_00020181; +struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 }; + +struct slver md5_ctx_mgr_flush_sse_slver_00020182; +struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm new file mode 100644 index 000000000..7719946f0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm @@ -0,0 +1,55 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define JOB_MD5 structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; JOB_MD5 + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 4, 4 ; length in bytes +FIELD _result_digest, 4*4, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + +%assign _JOB_MD5_size _FIELD_OFFSET +%assign _JOB_MD5_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm new file mode 100644 index 000000000..6caad6733 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm @@ -0,0 +1,73 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define MD5 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MD5_ARGS_X32 +;;; name size align +FIELD _digest, 4*4*32, 16 ; transposed digest +FIELD _data_ptr, 8*32, 8 ; array of pointers to data +END_FIELDS + +%assign _MD5_ARGS_X8_size _FIELD_OFFSET +%assign _MD5_ARGS_X8_align _STRUCT_ALIGN +%assign _MD5_ARGS_X16_size _FIELD_OFFSET +%assign _MD5_ARGS_X16_align _STRUCT_ALIGN +%assign _MD5_ARGS_X32_size _FIELD_OFFSET +%assign _MD5_ARGS_X32_align _STRUCT_ALIGN +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align +FIELD _lens, 4*32, 8 +FIELD _unused_lanes, 8*4, 8 +FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..b74646de4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm @@ -0,0 +1,248 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern md5_mb_x4x2_avx + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; UN*X register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x4x2_avx +%define idx r8 + +%define unused_lanes r9 + +%define lane_data r10 + +%define job_rax rax +%define tmp rax + +%endif ;; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state) +; arg 1 : rcx : state +mk_global md5_mb_mgr_flush_avx, function +md5_mb_mgr_flush_avx: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; if bit (32+3) is set, then all lanes are empty + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + 
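+	; (each dword in _lens packs a lane's remaining length in its upper bits and
+	;  the lane index in its low nibble, so the minimum computed by this
+	;  vpminud/vpalignr tree yields both the shortest length and its lane)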
vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x4x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*32] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..910d5af89 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm @@ -0,0 +1,255 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern md5_mb_x8x2_avx2 + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; UN*X register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x8x2_avx2 +%define idx rbp + +%define unused_lanes r9 + +%define lane_data r10 + +%define job_rax rax +%define tmp rax + +%define num_lanes_inuse r8 + +%endif ;; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state) +; arg 1 : rcx : state +mk_global md5_mb_mgr_flush_avx2, function +md5_mb_mgr_flush_avx2: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + 
_data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x8x2_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + + vmovd xmm0, [state + _args_digest + 4*idx + 0*64] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..a0eaf428a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm @@ -0,0 +1,315 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
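As a point of reference for the "Find min length" blocks in the flush routines above, the following scalar C sketch models what the vpminud/vpalignr (and, for AVX2, vperm2i128) reduction computes. It is not part of the upstream sources; NLANES and the function name are illustrative. Each lens[] entry packs (block_count << 4) | lane_index (the AVX-512 manager uses a 6-bit lane field instead), so taking the unsigned minimum over the array yields both the shortest outstanding job and the lane it occupies in a single value.

#include <stdint.h>
#include <stddef.h>

#define NLANES 16	/* 8 for SSE/AVX, 16 for AVX2, 32 for AVX-512 */

/* Scalar model of the SIMD minimum reduction over the _lens array. */
static void find_min_len(const uint32_t lens[NLANES],
			 uint32_t *lane, uint32_t *min_blocks)
{
	uint32_t min = lens[0];
	size_t i;

	for (i = 1; i < NLANES; i++)
		if (lens[i] < min)
			min = lens[i];

	*lane = min & 0xF;	/* low nibble: lane index (0x3F mask on AVX-512) */
	*min_blocks = min >> 4;	/* remaining bits: shortest block count          */
}

If min_blocks is zero, some lane already holds a finished job; otherwise the managers broadcast the minimum with its lane field cleared (the clear_low_nibble / clear_low_6bits constants) and subtract it from every lens entry before calling the multi-lane core, so all lanes advance by the same number of blocks.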
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern md5_mb_x16x2_avx512 + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; UN*X register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x16_avx512 +%define idx rbp + +%define unused_lanes ymm7 +%define lane r9 + +%define lane_data r10 + +%define job_rax rax +%define tmp rax + +%define num_lanes_inuse r8 + +%endif ;; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +;; Byte shift in MEM addr, read a extra byte [addr+16] +%macro MEM_VPSRLDDQ 2 +%define %%addr %1 +%define %%TMP_YMM %2 + vmovdqu %%TMP_YMM, [%%addr + 1] + vmovdqu [%%addr], %%TMP_YMM + mov [%%addr + 31], byte 0 +%endmacro + +;; Byte shift in MEM addr, read a extra byte [addr-1] +%macro MEM_VPSLLDDQ 2 +%define %%addr %1 +%define %%TMP_YMM %2 + vmovdqu %%TMP_YMM, [%%addr-1] + vmovdqu [%%addr], %%TMP_YMM + mov [%%addr], byte 0 +%endmacro + +align 64 + +; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state) +; arg 1 : rcx : state +mk_global md5_mb_mgr_flush_avx512, function +md5_mb_mgr_flush_avx512: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp 
+ mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 31 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 32 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + ; Find min length + vmovdqu ymm5, [state + _lens + 2*32] + vmovdqu ymm6, [state + _lens + 3*32] + + vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C} + vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E} + vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword + vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed + vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword + + vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0x3F + shr len2, 6 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_6bits] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + vpsubd ymm5, ymm5, ymm2 + vpsubd ymm6, ymm6, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + vmovdqu [state + _lens + 2*32], ymm5 + vmovdqu [state + _lens + 3*32], ymm6 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x16x2_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov lane, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + + shl lane, 8 + or lane, idx + MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes + mov [state + _unused_lanes], lane + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + + vmovd xmm0, [state + _args_digest + 
4*idx + 0*4*16*2] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_6bits: + dq 0x00000000FFFFFFC0, 0x0000000000000000 + dq 0x00000000FFFFFFC0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 +lane_16: dq 16 +lane_17: dq 17 +lane_18: dq 18 +lane_19: dq 19 +lane_20: dq 20 +lane_21: dq 21 +lane_22: dq 22 +lane_23: dq 23 +lane_24: dq 24 +lane_25: dq 25 +lane_26: dq 26 +lane_27: dq 27 +lane_28: dq 28 +lane_29: dq 29 +lane_30: dq 30 +lane_31: dq 31 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_md5_mb_mgr_flush_avx512 +no_md5_mb_mgr_flush_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm new file mode 100644 index 000000000..d3aa25f86 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm @@ -0,0 +1,249 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern md5_mb_x4x2_sse + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; UN*X register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x4x2_sse +%define idx r8 + +%define unused_lanes r9 + +%define lane_data r10 + +%define job_rax rax +%define tmp rax + +%endif ;; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state) +; arg 1 : rcx : state +mk_global md5_mb_mgr_flush_sse, function +md5_mb_mgr_flush_sse: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; if bit (32+3) is set, then all lanes are empty + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty 
lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + movdqa xmm0, [state + _lens + 0*16] + movdqa xmm1, [state + _lens + 1*16] + + movdqa xmm2, xmm0 + pminud xmm2, xmm1 ; xmm2 has {D,C,B,A} + palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + pminud xmm2, xmm3 ; xmm2 has {x,x,E,F} + palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + pminud xmm2, xmm3 ; xmm2 has min value in low dword + + movd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + pand xmm2, [rel clear_low_nibble] + pshufd xmm2, xmm2, 0 + + psubd xmm0, xmm2 + psubd xmm1, xmm2 + + movdqa [state + _lens + 0*16], xmm0 + movdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*32] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c new file mode 100644 index 000000000..f41e5efbc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
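The copy_lane_data blocks in the flush routines above can be summarised by the scalar sketch below. This is not upstream code; the struct and field names are simplified stand-ins for the real MB_MGR layout. Flush first picks some lane idx whose _job_in_lane is non-null, then gives every empty lane that lane's data pointer and a maximal length, so idle lanes hash harmless dummy data and can never win the minimum-length search.

#include <stdint.h>
#include <stddef.h>

#define NLANES 8	/* md5_mb_mgr_flush_sse manages 8 lanes; AVX2 uses 16, AVX-512 uses 32 */

struct mini_mgr {
	const uint8_t *data_ptr[NLANES];	/* models _args._data_ptr        */
	uint32_t lens[NLANES];			/* models _lens                  */
	void *job_in_lane[NLANES];		/* models _ldata[i]._job_in_lane */
};

/* Give idle lanes a valid buffer and a length that never wins the min search. */
static void copy_lane_data(struct mini_mgr *m, size_t idx)
{
	size_t i;

	for (i = 0; i < NLANES; i++) {
		if (m->job_in_lane[i] == NULL) {
			m->data_ptr[i] = m->data_ptr[idx];
			m->lens[i] = 0xFFFFFFFF;
		}
	}
}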
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < 16; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c new file mode 100644 index 000000000..5ff02aa76 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c @@ -0,0 +1,44 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0x0706050403020100; + state->unused_lanes[1] = 0x0f0e0d0c0b0a0908; + state->unused_lanes[2] = 0x1716151413121110; + state->unused_lanes[3] = 0x1f1e1d1c1b1a1918; + state->num_lanes_inuse = 0; + for (j = 0; j < 32; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c new file mode 100644 index 000000000..615cd9d76 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0xF76543210; + state->num_lanes_inuse = 0; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..96adcf614 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm @@ -0,0 +1,228 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
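The md5_mb_mgr_init_* functions above seed _unused_lanes as a packed stack of free lane indices: one nibble per lane for the SSE/AVX/AVX2 managers (0xF76543210 and 0xfedcba9876543210) and one byte per lane spread over four 64-bit words for AVX-512. A rough C model of the nibble variant, with illustrative function names and not taken from upstream, is:

#include <stdint.h>

/* Pop the next free lane: it sits in the lowest nibble of the stack. */
static unsigned pop_lane(uint64_t *unused_lanes)
{
	unsigned lane = (unsigned)(*unused_lanes & 0xF);

	*unused_lanes >>= 4;
	return lane;
}

/* Push a completed lane back so it becomes the next one handed out. */
static void push_lane(uint64_t *unused_lanes, unsigned lane)
{
	*unused_lanes = (*unused_lanes << 4) | lane;
}

This mirrors the "and lane, 0xF / shr unused_lanes, 4" and "shl unused_lanes, 4 / or unused_lanes, idx" sequences in the submit and flush paths. The extra 0xF in the SSE seed sits above the eight lane nibbles as a sentinel, which is why md5_mb_mgr_flush_sse can test bit 32+3 of _unused_lanes to detect that all lanes are empty.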
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +extern md5_mb_x4x2_avx + +%if 1 +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%else +; UN*X register definitions +%define arg1 rdi +%define arg2 rsi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x4x2_avx +%define idx r8 + +%define p r9 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane r10 + +%define lane_data r11 + +%endif ; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global md5_mb_mgr_submit_avx, function +md5_mb_mgr_submit_avx: + endbranch + + sub rsp, STACK_SPACE + ; we need to save/restore all GPRs because lower layer clobbers them + mov [rsp + 8*0], rbx + mov [rsp + 8*1], rbp + mov [rsp + 8*2], r12 + mov [rsp + 8*3], r13 + mov [rsp + 8*4], r14 + mov [rsp + 8*5], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*6], rsi + mov [rsp + 8*7], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovd [state + _args_digest + 4*lane + 0*32], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*32], 
xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x4x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*32] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*6] + mov rdi, [rsp + 8*7] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*1] + mov r12, [rsp + 8*2] + mov r13, [rsp + 8*3] + mov r14, [rsp + 8*4] + mov r15, [rsp + 8*5] + + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..ed9b0588e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm @@ -0,0 +1,239 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
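The digest scatter and gather just above (vmovd/vpextrd on submit, vmovd/vpinsrd on completion) operate on a transposed, structure-of-arrays _args_digest: word w of lane l lives at byte offset 4*l + w*4*NLANES, which is why the stride between words is 32 bytes for the 8-lane SSE/AVX managers, 64 for AVX2 and 128 (4*16*2) for AVX-512. A small C sketch of that layout, with illustrative names and not taken from upstream:

#include <stdint.h>
#include <stddef.h>

#define NLANES       8	/* 8 for SSE/AVX, 16 for AVX2, 32 for AVX-512 */
#define DIGEST_WORDS 4	/* MD5 digest is four 32-bit words             */

/* Store one lane's digest into the word-major (transposed) args_digest. */
static void scatter_digest(uint32_t args_digest[DIGEST_WORDS * NLANES],
			   size_t lane, const uint32_t digest[DIGEST_WORDS])
{
	size_t w;

	for (w = 0; w < DIGEST_WORDS; w++)
		args_digest[w * NLANES + lane] = digest[w];
}

/* Read one lane's digest back out, as done when a job completes. */
static void gather_digest(const uint32_t args_digest[DIGEST_WORDS * NLANES],
			  size_t lane, uint32_t digest[DIGEST_WORDS])
{
	size_t w;

	for (w = 0; w < DIGEST_WORDS; w++)
		digest[w] = args_digest[w * NLANES + lane];
}

Keeping the digests word-major lets the core routines load one digest word for all lanes with a single vector load.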
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern md5_mb_x8x2_avx2 + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define lane rsi + +%else +; UN*X register definitions +%define arg1 rdi +%define arg2 rsi + +%define lane rdx + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in a register not clobberred by md5_mb_x8x2_avx2 +%define idx rbp + +%define p r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define num_lanes_inuse r9 + +%define lane_data r10 + +%endif ; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global md5_mb_mgr_submit_avx2, function +md5_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + ; we need to save/restore all GPRs because lower layer clobbers them + mov [rsp + 8*0], rbx + mov [rsp + 8*1], rbp + mov [rsp + 8*2], r12 + mov [rsp + 8*3], r13 + mov [rsp + 8*4], r14 + mov [rsp + 8*5], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*6], rsi + mov [rsp + 8*7], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovd [state + _args_digest + 4*lane + 0*64], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov 
DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 16 + jne return_null + +start_loop: + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x8x2_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + + vmovd xmm0, [state + _args_digest + 4*idx + 0*64] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*6] + mov rdi, [rsp + 8*7] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*1] + mov r12, [rsp + 8*2] + mov r13, [rsp + 8*3] + mov r14, [rsp + 8*4] + mov r15, [rsp + 8*5] + + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=32 + +align 32 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..1bbc2be2c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm @@ -0,0 +1,283 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern md5_mb_x16x2_avx512 + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define lane rsi + +%else +; UN*X register definitions +%define arg1 rdi +%define arg2 rsi + +%define lane rdx + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in a register not clobberred by md5_mb_x16_avx512 +%define idx rbp + +%define p r11 + +%define unused_lanes ymm7 + +%define job_rax rax +%define len rax + +%define num_lanes_inuse r9 + +%define lane_data r10 + +%endif ; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +;; Byte shift in MEM addr, read a extra byte [addr+16] +%macro MEM_VPSRLDDQ 2 +%define %%addr %1 +%define %%TMP_YMM %2 + vmovdqu %%TMP_YMM, [%%addr + 1] + vmovdqu [%%addr], %%TMP_YMM + mov [%%addr + 31], byte 0 +%endmacro + +;; Byte shift in MEM addr, read a extra byte [addr-1] +%macro MEM_VPSLLDDQ 2 +%define %%addr %1 +%define %%TMP_YMM %2 + vmovdqu %%TMP_YMM, [%%addr-1] + vmovdqu [%%addr], %%TMP_YMM + mov [%%addr], byte 0 +%endmacro + +align 64 + +; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global md5_mb_mgr_submit_avx512, function +md5_mb_mgr_submit_avx512: + endbranch + + sub rsp, STACK_SPACE + ; we need to save/restore all GPRs because lower layer clobbers them + mov [rsp + 8*0], rbx + mov [rsp + 8*1], rbp + mov [rsp + 8*2], r12 + mov [rsp + 8*3], r13 + mov [rsp + 8*4], r14 + mov [rsp + 8*5], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*6], rsi + mov [rsp + 8*7], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov lane, [state + _unused_lanes] + and lane, 0x3F + MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes + imul lane_data, 
lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov DWORD(len), [job + _len] + + shl len, 6 ; low 5 bits store idx + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovd [state + _args_digest + 4*lane + 0*4*16*2], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 32 + jne return_null + +start_loop: + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + ; Find min length + vmovdqu ymm5, [state + _lens + 2*32] + vmovdqu ymm6, [state + _lens + 3*32] + + vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A} + vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C} + vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F} + vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E} + vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword + vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed + vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword + + vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0x3F + shr len2, 6 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_6bits] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + vpsubd ymm5, ymm5, ymm2 + vpsubd ymm6, ymm6, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + vmovdqu [state + _lens + 2*32], ymm5 + vmovdqu [state + _lens + 3*32], ymm6 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x16x2_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov lane, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + + shl lane, 8 + or lane, idx + MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes + mov [state + _unused_lanes], lane + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 
16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*6] + mov rdi, [rsp + 8*7] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*1] + mov r12, [rsp + 8*2] + mov r13, [rsp + 8*3] + mov r14, [rsp + 8*4] + mov r15, [rsp + 8*5] + + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=32 + +align 32 +clear_low_6bits: + dq 0x00000000FFFFFFC0, 0x0000000000000000 + dq 0x00000000FFFFFFC0, 0x0000000000000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_md5_mb_mgr_submit_avx512 +no_md5_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..2a374c7e3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm @@ -0,0 +1,229 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
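The AVX-512 managers above keep _unused_lanes as a 32-byte array, one free-lane index per byte, and manipulate it with the MEM_VPSRLDDQ / MEM_VPSLLDDQ macros plus a qword read-modify-write. As a rough scalar model (not upstream code; names are illustrative), popping shifts the array down by one byte and pushing shifts it up and writes the completed lane into the bottom byte:

#include <stdint.h>
#include <string.h>

#define STACK_BYTES 32	/* one byte per lane for the 32-lane AVX-512 managers */

/* Pop the next free lane (bottom byte), then shift the stack down (MEM_VPSRLDDQ). */
static uint8_t pop_lane_byte(uint8_t lanes[STACK_BYTES])
{
	uint8_t lane = lanes[0] & 0x3F;

	memmove(lanes, lanes + 1, STACK_BYTES - 1);
	lanes[STACK_BYTES - 1] = 0;
	return lane;
}

/* Shift the stack up (MEM_VPSLLDDQ) and make the completed lane the bottom byte. */
static void push_lane_byte(uint8_t lanes[STACK_BYTES], uint8_t lane)
{
	memmove(lanes + 1, lanes, STACK_BYTES - 1);
	lanes[0] = lane;
}

The nibble-packed scheme used by the narrower managers would overflow a single 64-bit word at 32 lanes, which is presumably why the AVX-512 code switches to one byte per lane.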
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_job.asm" +%include "md5_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern md5_mb_x4x2_sse + +[bits 64] +default rel +section .text + +%if 1 +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%else +; UN*X register definitions +%define arg1 rdi +%define arg2 rsi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by md5_mb_x4x2_sse +%define idx r8 + +%define p r9 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane r10 + +%define lane_data r11 + +%endif ; if 1 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global md5_mb_mgr_submit_sse, function +md5_mb_mgr_submit_sse: + endbranch + + sub rsp, STACK_SPACE + ; we need to save/restore all GPRs because lower layer clobbers them + mov [rsp + 8*0], rbx + mov [rsp + 8*1], rbp + mov [rsp + 8*2], r12 + mov [rsp + 8*3], r13 + mov [rsp + 8*4], r14 + mov [rsp + 8*5], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*6], rsi + mov [rsp + 8*7], rdi + movdqa [rsp + 8*8 + 16*0], xmm6 + movdqa [rsp + 8*8 + 16*1], xmm7 + movdqa [rsp + 8*8 + 16*2], xmm8 + movdqa [rsp + 8*8 + 16*3], xmm9 + movdqa [rsp + 8*8 + 16*4], xmm10 + movdqa [rsp + 8*8 + 16*5], xmm11 + movdqa [rsp + 8*8 + 16*6], xmm12 + movdqa [rsp + 8*8 + 16*7], xmm13 + movdqa [rsp + 8*8 + 16*8], xmm14 + movdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + movdqu xmm0, [job + _result_digest + 0*16] + movd [state + _args_digest + 4*lane + 0*32], xmm0 + pextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1 + pextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2 + pextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + movdqa xmm0, [state + _lens + 0*16] + movdqa xmm1, [state + _lens + 1*16] + + movdqa xmm2, xmm0 + pminud xmm2, xmm1 ; xmm2 has {D,C,B,A} + palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + pminud xmm2, xmm3 ; xmm2 has {x,x,E,F} + palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + pminud xmm2, xmm3 ; xmm2 has min value in low dword + + movd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + pand xmm2, [rel clear_low_nibble] + pshufd xmm2, xmm2, 0 + + psubd xmm0, xmm2 + psubd xmm1, xmm2 + + movdqa [state + _lens + 0*16], xmm0 + movdqa [state + _lens + 1*16], xmm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_mb_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + 
_job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov dword [state + _lens + 4*idx], 0xFFFFFFFF + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*32] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 8*8 + 16*0] + movdqa xmm7, [rsp + 8*8 + 16*1] + movdqa xmm8, [rsp + 8*8 + 16*2] + movdqa xmm9, [rsp + 8*8 + 16*3] + movdqa xmm10, [rsp + 8*8 + 16*4] + movdqa xmm11, [rsp + 8*8 + 16*5] + movdqa xmm12, [rsp + 8*8 + 16*6] + movdqa xmm13, [rsp + 8*8 + 16*7] + movdqa xmm14, [rsp + 8*8 + 16*8] + movdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*6] + mov rdi, [rsp + 8*7] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*1] + mov r12, [rsp + 8*2] + mov r13, [rsp + 8*3] + mov r14, [rsp + 8*4] + mov r15, [rsp + 8*5] + + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c new file mode 100644 index 000000000..bba868f1a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c @@ -0,0 +1,159 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	int ret;
+
+	printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+	srand(TEST_SEED);
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	md5_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// SSL test
+		MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+
+		// sb_md5 test
+		md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (md5_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_le32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_le32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+	putchar('.');
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		md5_ctx_mgr_init(mgr);
+
+		for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run SSL test
+			MD5(bufs[i], lens[i], digest_ssl[i]);
+
+			// Run sb_md5 test
+			md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (md5_ctx_mgr_flush(mgr)) ;
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] !=
+				    to_le32(((uint32_t *) digest_ssl[i])[j])) {
+					fail++;
+					printf("Test%d, digest%d fail %08X <=> %08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       to_le32(((uint32_t *) digest_ssl[i])[j]));
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %d\n", fail);
+			return fail;
+		}
+
+		putchar('.');
+		fflush(0);
+	}	// random test t
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5_ssl rand: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
new file mode 100644
index 000000000..d19246138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	uint8_t *tmp_buf;
+	int ret;
+
+	printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	md5_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contexts
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+		// Run sb_md5 test
+		md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (md5_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail) {
+		printf("Test failed function check %d\n", fail);
+		return fail;
+
} + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + md5_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + md5_ref(bufs[i], digest_ref[i], lens[i]); + + // Run md5_mb test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + md5_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + md5_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_md5 test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c new file mode 100644 index 000000000..2eab61dfa --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c @@ -0,0 +1,297 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*MD5_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+	uint32_t i, j, fail = 0;
+	int len_done, len_rem, len_rand;
+	unsigned char *bufs[TEST_BUFS];
+	unsigned char *buf_ptr[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int joblen, jobs, t;
+	int ret;
+
+	printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	md5_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		buf_ptr[i] = bufs[i];
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+	}
+
+	// Run sb_md5 tests
+	for (i = 0; i < TEST_BUFS;) {
+		len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_done == 0)
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i],
+						 buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+		else if (len_rem <= UPDATE_SIZE)
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = md5_ctx_mgr_submit(mgr,
+						 &ctxpool[i],
+						 buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Add jobs while available or finished
+		if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+			i++;
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+	}
+
+	// Start flushing finished jobs, end on last flushed
+	ctx = md5_ctx_mgr_flush(mgr);
+	while (ctx) {
+		if (hash_ctx_complete(ctx)) {
+			debug_char('-');
+			ctx = md5_ctx_mgr_flush(mgr);
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+
+		len_done = (int)((unsigned long)buf_ptr[i]
+				 - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if
(len_rem <= UPDATE_SIZE) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = md5_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + md5_ref(bufs[i], digest_ref[i], lens[i]); + } + + md5_ctx_mgr_init(mgr); + + // Run md5_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = MD5_BLOCK_SIZE + + MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % MD5_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = md5_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = md5_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % MD5_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = md5_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c new file mode 100644 index 000000000..4f84b6723 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c @@ -0,0 +1,229 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "md5_mb.h"
+
+typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS];
+
+#define MSGS 13
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "Test vector from febooti.com";
+static uint8_t msg2[] = "12345678901234567890" "12345678901234567890"
+    "12345678901234567890" "12345678901234567890";
+static uint8_t msg3[] = "";
+static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msg5[] = "message digest";
+static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msg7[] = "abc";
+static uint8_t msg8[] = "a";
+
+static uint8_t msg9[] = "";
+static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msgB[] = "message digest";
+static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msgD[] = "abc";
+
+static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff };
+static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 };
+static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 };
+
+static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigestB = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9,
+	msgA, msgB, msgC, msgD
+};
+
+static uint32_t *expResultDigest[MSGS] = {
+	expResultDigest1, expResultDigest2, expResultDigest3,
+	expResultDigest4, expResultDigest5, expResultDigest6,
+	expResultDigest7, expResultDigest8, expResultDigest9,
+	expResultDigestA, expResultDigestB, expResultDigestC,
+	expResultDigestD
+};
+
+int main(void)
+{
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+	uint32_t i, j, k, t, checked = 0;
+	uint32_t *good;
+	int ret;
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	md5_ctx_mgr_init(mgr);
+
+	// Init contexts before first use
+	for (i = 0; i < MSGS; i++) {
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	for (i = 0; i < MSGS; i++) {
+		ctx = md5_ctx_mgr_submit(mgr,
+					 &ctxpool[i], msgs[i],
+					 strlen((char *)msgs[i]), HASH_ENTIRE);
+
+		if (ctx) {
+			t = (unsigned long)(ctx->user_data);
+			good = expResultDigest[t];
+			checked++;
+			for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+				if (good[j] != ctxpool[t].job.result_digest[j]) {
+					printf("Test %d, digest %d is %08X, should be %08X\n",
+					       t, j, ctxpool[t].job.result_digest[j], good[j]);
+					return -1;
+				}
+			}
+ + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = md5_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = md5_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_md5 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c new file mode 100644 index 000000000..0ba50a1d2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c @@ -0,0 +1,129 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+int main(void)
+{
+	int ret;
+	MD5_HASH_CTX_MGR *mgr = NULL;
+	MD5_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, t, fail = 0;
+	struct perf start, stop;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+		if (bufs[i] == NULL) {
+			printf("calloc failed test aborted\n");
+			return 1;
+		}
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+	if (ret) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	md5_ctx_mgr_init(mgr);
+
+	// Start OpenSSL tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+	}
+	perf_stop(&stop);
+
+	printf("md5_openssl" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	// Start mb tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+		while (md5_ctx_mgr_flush(mgr)) ;
+	}
+	perf_stop(&stop);
+
+	printf("multibinary_md5" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_le32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_le32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+
+	printf("Multi-buffer md5 test complete %d buffers of %d B with "
+	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_md5_ossl_perf: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
new file mode 100644
index 000000000..718572638 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm @@ -0,0 +1,853 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + + +;; code to compute double octal MD5 using AVX512 + +;; Stack must be aligned to 64 bytes before call + +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers zmm0-8, 14-31 + +;; clobbers all GPRs other than arg1 and rbp + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks +;; These are pointers to data block1 and block2 in the stack +; which will ping pong back and forth +%define DPTR1 rbx +%define DPTR2 var2 +%define IDX var1 +%define TBL rax + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +;; Transposed Digest Storage +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define A1 zmm4 +%define B1 zmm5 +%define C1 zmm6 +%define D1 zmm7 + +%define md5c zmm16 + +%define MASK0 zmm17 +%define MASK1 zmm18 + +%define TMP0 zmm20 +%define TMP1 zmm21 + + +;; Data are stored into the Wx after 
transposition +%define W0 zmm8 +%define W1 zmm9 +%define W2 zmm10 +%define W3 zmm11 +%define W4 zmm12 +%define W5 zmm13 +%define W6 zmm14 +%define W7 zmm15 + +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +%define MD5_DIGEST_ROW_SIZE (16*4) +%define APPEND(a,b) a %+ b +%define APPEND3(a,b,c) a %+ b %+ c + +;; Temporary registers used during data transposition + +%define RESZ resb 64* +;; Assume stack aligned to 64 bytes before call +;; Therefore FRAMESIZE mod 64 must be 64-8 = 56 +struc STACK +_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA2-Z_DD2 +_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily +_RSP_SAVE: RESQ 1 ; original RSP +endstruc + +%define Z_AA rsp + _DIGEST + 64*0 +%define Z_BB rsp + _DIGEST + 64*1 +%define Z_CC rsp + _DIGEST + 64*2 +%define Z_DD rsp + _DIGEST + 64*3 +%define Z_AA1 rsp + _DIGEST + 64*4 +%define Z_BB1 rsp + _DIGEST + 64*5 +%define Z_CC1 rsp + _DIGEST + 64*6 +%define Z_DD1 rsp + _DIGEST + 64*7 + +%define MD5_DIGEST_ROW_SIZE (32*4) + + +;; +;; MD5 left rotations (number of bits) +;; +%define rot11 7 +%define rot12 12 +%define rot13 17 +%define rot14 22 +%define rot21 5 +%define rot22 9 +%define rot23 14 +%define rot24 20 +%define rot31 4 +%define rot32 11 +%define rot33 16 +%define rot34 23 +%define rot41 6 +%define rot42 10 +%define rot43 15 +%define rot44 21 + +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + +; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} + +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 
e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 
p0 00 n0 m0} + +;; At this point, the registers that contain interesting data are: +;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 +;; Can use t1 and r14 as scratch registers + + vmovdqa32 %%r14, MASK0 + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, MASK1 + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, MASK0 + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, MASK1 + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, MASK0 + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, MASK1 + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, MASK0 + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, MASK1 + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, MASK0 + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, MASK1 + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, MASK0 + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, MASK1 + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, MASK0 + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, MASK1 + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, MASK0 + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, MASK1 + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + +;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 
h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_ARGS1 0 +%xdefine TMP_ D1 +%xdefine D1 C1 +%xdefine C1 B1 +%xdefine B1 A1 +%xdefine A1 TMP_ +%endm + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot) +;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx +%macro PROCESS_LOOP 6 +%define %%MD5const %1 +%define %%data %2 +%define %%F_IMMED %3 +%define %%NROT %4 +%define %%TMP_PR0 %5 +%define %%TMP_PR1 %6 + ; a=b+((a+Ft(b,c,d)+Mj+ti)<=1) + +local_func_decl(md5_mb_x16x2_avx512) +md5_mb_x16x2_avx512: + endbranch + mov rax, rsp + sub rsp, STACK_size + and rsp, -64 + mov [rsp + _RSP_SAVE], rax + + mov DPTR1, rsp + lea DPTR2, [rsp + 64*32] + + ;; Load MD5 constant pointer to register + lea TBL, [MD5_TABLE] + vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1] + vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2] + + ;; Preload input data from 16 segments. + xor IDX, IDX + + ;; transpose input onto stack + ;; first 16 lanes read + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + ;; first 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR1+_DATA+(0)*64],W0 + vmovdqa32 [DPTR1+_DATA+(1)*64],W1 + vmovdqa32 [DPTR1+_DATA+(2)*64],W2 + vmovdqa32 [DPTR1+_DATA+(3)*64],W3 + vmovdqa32 [DPTR1+_DATA+(4)*64],W4 + vmovdqa32 [DPTR1+_DATA+(5)*64],W5 + vmovdqa32 [DPTR1+_DATA+(6)*64],W6 + vmovdqa32 [DPTR1+_DATA+(7)*64],W7 + vmovdqa32 [DPTR1+_DATA+(8)*64],W8 + vmovdqa32 [DPTR1+_DATA+(9)*64],W9 + vmovdqa32 [DPTR1+_DATA+(10)*64],W10 + vmovdqa32 [DPTR1+_DATA+(11)*64],W11 + vmovdqa32 [DPTR1+_DATA+(12)*64],W12 + vmovdqa32 [DPTR1+_DATA+(13)*64],W13 + vmovdqa32 [DPTR1+_DATA+(14)*64],W14 + vmovdqa32 [DPTR1+_DATA+(15)*64],W15 + + ;; second 16 lanes read + mov inp0, [IN + 16*8] + mov inp1, [IN + 17*8] + mov inp2, [IN + 18*8] + mov inp3, [IN + 19*8] + mov inp4, [IN + 20*8] + mov inp5, [IN + 21*8] + mov inp6, [IN + 22*8] + mov inp7, [IN + 23*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 24*8] + mov inp1, [IN + 25*8] + mov inp2, [IN + 26*8] + mov inp3, [IN + 27*8] + mov inp4, [IN + 28*8] + mov inp5, [IN + 29*8] + mov inp6, [IN + 30*8] + mov inp7, [IN + 31*8] + vmovdqu32 
W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + ;; second 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0 + vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1 + vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2 + vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3 + vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4 + vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5 + vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6 + vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7 + vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8 + vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9 + vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10 + vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11 + vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12 + vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13 + vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14 + vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15 + + ;; Initialize digests + ;; vmovdqu32 replace vmovdqa32 + vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE] + ; Load the digest for each stream (9-16) + vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64] + +.lloop: + ;; Increment IDX to point to next data block (64 bytes per block) + add IDX, 64 + + ; Save digests for later addition + vmovdqa32 [Z_AA], A + vmovdqa32 [Z_BB], B + vmovdqa32 [Z_CC], C + vmovdqa32 [Z_DD], D + vmovdqa32 [Z_AA1], A1 + vmovdqa32 [Z_BB1], B1 + vmovdqa32 [Z_CC1], C1 + vmovdqa32 [Z_DD1], D1 + + sub SIZE, 1 + je .LastLoop + +%assign I 0 +%assign I_fimm 0xCA +%rep 16 ; 0<=I<=15 + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + %assign I_data I + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + ;; first 16 lanes read + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + +%assign I 16 +%assign I_fimm 0xE4 +%rep 16 ; 16<=I<=31 + %assign I_data ((5*I+1) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ;; first 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR2+_DATA+(0)*64],W0 + vmovdqa32 [DPTR2+_DATA+(1)*64],W1 + vmovdqa32 [DPTR2+_DATA+(2)*64],W2 + vmovdqa32 [DPTR2+_DATA+(3)*64],W3 + vmovdqa32 [DPTR2+_DATA+(4)*64],W4 + vmovdqa32 
[DPTR2+_DATA+(5)*64],W5 + vmovdqa32 [DPTR2+_DATA+(6)*64],W6 + vmovdqa32 [DPTR2+_DATA+(7)*64],W7 + vmovdqa32 [DPTR2+_DATA+(8)*64],W8 + vmovdqa32 [DPTR2+_DATA+(9)*64],W9 + vmovdqa32 [DPTR2+_DATA+(10)*64],W10 + vmovdqa32 [DPTR2+_DATA+(11)*64],W11 + vmovdqa32 [DPTR2+_DATA+(12)*64],W12 + vmovdqa32 [DPTR2+_DATA+(13)*64],W13 + vmovdqa32 [DPTR2+_DATA+(14)*64],W14 + vmovdqa32 [DPTR2+_DATA+(15)*64],W15 + +%assign I 32 +%assign I_fimm 0x96 +%rep 16 ; 32<=I<=47 + %assign I_data ((3*I+5) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ;; second 16 lanes read + mov inp0, [IN + 16*8] + mov inp1, [IN + 17*8] + mov inp2, [IN + 18*8] + mov inp3, [IN + 19*8] + mov inp4, [IN + 20*8] + mov inp5, [IN + 21*8] + mov inp6, [IN + 22*8] + mov inp7, [IN + 23*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 24*8] + mov inp1, [IN + 25*8] + mov inp2, [IN + 26*8] + mov inp3, [IN + 27*8] + mov inp4, [IN + 28*8] + mov inp5, [IN + 29*8] + mov inp6, [IN + 30*8] + mov inp7, [IN + 31*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + +%assign I 48 +%assign I_fimm 0x39 +%rep 16 ; 48<=I<=63 + %assign I_rotX (I/16+1) + %assign I_rotY (I % 4 + 1) + %assign I_data ((7*I) % 16) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ;; second 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0 + vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1 + vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2 + vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3 + vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4 + vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5 + vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6 + vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7 + vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8 + vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9 + vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10 + vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11 + vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12 + vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13 + vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14 + vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15 + + ; Add old digest + vpaddd A,A,[Z_AA] + vpaddd B,B,[Z_BB] + vpaddd C,C,[Z_CC] + vpaddd D,D,[Z_DD] + vpaddd A1,A1,[Z_AA1] + vpaddd B1,B1,[Z_BB1] + vpaddd C1,C1,[Z_CC1] + vpaddd D1,D1,[Z_DD1] + + ; Swap DPTR1 and DPTR2 + xchg DPTR1, DPTR2 + ;; Proceed to processing of next block + jmp .lloop + +.LastLoop: +%assign I 0 +%assign I_fimm 0xCA +%rep 16 ; 0<=I<=15 + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + %assign I_data I + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 16 +%assign I_fimm 0xE4 +%rep 16 ; 16<=I<=31 + %assign I_data ((5*I+1) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 32 +%assign I_fimm 0x96 +%rep 16 ; 32<=I<=47 + %assign I_data ((3*I+5) % 16) + 
%assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 48 +%assign I_fimm 0x39 +%rep 16 ; 48<=I<=63 + %assign I_rotX (I/16+1) + %assign I_rotY (I % 4 + 1) + %assign I_data ((7*I) % 16) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ; Add old digest + vpaddd A,A,[Z_AA] + vpaddd B,B,[Z_BB] + vpaddd C,C,[Z_CC] + vpaddd D,D,[Z_DD] + vpaddd A1,A1,[Z_AA1] + vpaddd B1,B1,[Z_BB1] + vpaddd C1,C1,[Z_CC1] + vpaddd D1,D1,[Z_DD1] + + ;; update into data pointers +%assign I 0 +%rep 16 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A + vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B + vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C + vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D + ; Store the digest for each stream (9-16) + vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1 + vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1 + vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1 + vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1 + + mov rsp, [rsp + _RSP_SAVE] + ret + +section .data +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee + dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 + dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be + dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa + dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 + dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed + dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c + dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 + dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 + dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + dd 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 + dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 + dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 + dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_md5_mb_x16x2_avx512 +no_md5_mb_x16x2_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm new file mode 100644 index 000000000..afca137bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm @@ -0,0 +1,783 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; clobbers all XMM registers +; clobbers all GPRs except arg1 and r8 + +;; code to compute octal MD5 using AVX + +; clobbers all XMM registers +; clobbers all GPRs except arg1 and r8 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + ;movdqa %%F,%%Z + vpxor %%F,%%Z, %%Y + vpand %%F,%%F,%%X + vpxor %%F,%%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + ;movdqa %%F,%%Z + vpxor %%F,%%Z, %%Y + vpxor %%F,%%F, %%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 
+%define %%Z %4 + vpcmpeqd %%F,%%F,%%F ; 0xFFFF + vpxor %%F,%%F,%%Z ; pnot %%Z + vpor %%F,%%F,%%X + vpxor %%F,%%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + ;movdqa %%tmp, %%reg + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, [%%data] + vpaddd %%A2, %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + vpaddd %%A, %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + vpaddd %%A2, %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + vmovdqa %%TMP,[%%data] + vmovdqa %%TMP2,[%%data + 16*16] + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, %%TMP + vpaddd %%A2, %%A2, %%TMP2 + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + vpaddd %%A, %%A, %%FUN + vpaddd %%A2, %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +%ifidn __OUTPUT_FORMAT__, elf64 +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define inp7 rcx +%define mem1 rdx +%else +;; Windows Registers +%define arg1 rcx +%define arg2 rdx +%define inp7 rdi +%define mem1 rsi +%endif +; r8 is not used + +; Common definitions +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define TBL rax +%define IDX rbx +%define mem2 rbp + + + + + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... 
\ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... / +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +MEM equ 16*16*2*2 ; two blocks of data stored in stack +; STACK_SIZE must be an odd multiple of 8 bytes in size +STACK_SIZE equ MEM + 16*8 + 8 + +%define AA rsp + MEM + 16*0 +%define BB rsp + MEM + 16*1 +%define CC rsp + MEM + 16*2 +%define DD rsp + MEM + 16*3 +%define AA2 rsp + MEM + 16*4 +%define BB2 rsp + MEM + 16*5 +%define CC2 rsp + MEM + 16*6 +%define DD2 rsp + MEM + 16*7 + +;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word + +;#define NUM_MD5_DIGEST_WORDS 4 +;#define NUM_LANES 8 +;#define MD5_BLOCK_SIZE 64 +; +;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES]; +; +;typedef struct { +; DECLARE_ALIGNED(digest_array digest, 16); +; UINT8* data_ptr[NUM_LANES]; +;} MD5_ARGS_X8; + +; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size) +; arg 1 : pointer to MD5_ARGS_X8 structure +; arg 2 : size (in blocks) ;; assumed to be >= 1 +; +; arg1 and r8 are maintained by this function +; +align 32 +mk_global md5_mb_x4x2_avx, function, internal +md5_mb_x4x2_avx: + endbranch + sub rsp, STACK_SIZE + + ;; Initialize digests + vmovdqu A,[arg1+0*16] + vmovdqu B,[arg1+2*16] + vmovdqu C,[arg1+4*16] + vmovdqu D,[arg1+6*16] + + vmovdqu A2,[arg1+1*16] + vmovdqu B2,[arg1+3*16] + vmovdqu C2,[arg1+5*16] + vmovdqu D2,[arg1+7*16] + + lea TBL, [MD5_TABLE] + + ;; load input pointers + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + mov inp4,[arg1 + _data_ptr + 4*8] + mov inp5,[arg1 + _data_ptr + 5*8] + mov inp6,[arg1 + _data_ptr + 6*8] + mov inp7,[arg1 + _data_ptr + 7*8] + + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16],T0 + vmovdqa [mem1+(I*4+1)*16],T1 + vmovdqa [mem1+(I*4+2)*16],T2 + vmovdqa [mem1+(I*4+3)*16],T3 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem1+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem1+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + + ; save old digests + vmovdqa [AA], A + vmovdqa [BB], B + vmovdqa [CC], C + vmovdqa [DD], D + ; save old digests + vmovdqa [AA2], A2 + vmovdqa [BB2], B2 + vmovdqa [CC2], C2 + vmovdqa [DD2], D2 + + add IDX, 4*16 + sub arg2, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu 
T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14 + + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, 
mem1+ 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd 
D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, 
A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44 + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; write out digests + vmovdqu [arg1+0*16], A + vmovdqu [arg1+2*16], B + vmovdqu [arg1+4*16], C + vmovdqu [arg1+6*16], D + + vmovdqu [arg1+1*16], A2 + vmovdqu [arg1+3*16], B2 + vmovdqu [arg1+5*16], C2 + vmovdqu [arg1+7*16], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + mov [arg1 + _data_ptr + 1*8], inp1 + mov [arg1 + _data_ptr + 2*8], inp2 + mov [arg1 + _data_ptr + 3*8], inp3 + mov [arg1 + _data_ptr + 4*8], inp4 + mov [arg1 + _data_ptr + 5*8], inp5 + mov [arg1 + _data_ptr + 6*8], inp6 + mov [arg1 + _data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + + ret + +section .data align=64 + +align 64 +MD5_TABLE: + dd 0xd76aa478, 
0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm new file mode 100644 index 000000000..b3b946634 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm @@ -0,0 +1,779 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; clobbers all XMM registers +; clobbers all GPRs except arg1 and r8 + +;; code to compute octal MD5 using SSE + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movdqa %%t0, %%r0 + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movdqa %%t1, %%r2 + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movdqa %%r1, %%t0 + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movdqa %%r3, %%r0 + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pand %%F,%%X + pxor %%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pxor 
%%F,%%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + pcmpeqd %%F,%%F + pxor %%F,%%Z ; pnot %%Z + por %%F,%%X + pxor %%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%tmp, (32-%%imm) + pslld %%reg, %%imm + por %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + paddd %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + paddd %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + paddd %%A, %%FUN + paddd %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +%ifidn __OUTPUT_FORMAT__, elf64 +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define inp7 rcx +%define mem1 rdx +%else +;; Windows Registers +%define arg1 rcx +%define arg2 rdx +%define inp7 rdi +%define mem1 rsi +%endif +; r8 is not used + +; Common definitions +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 + +%define TBL rax +%define IDX rbx +%define mem2 rbp + + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... 
/ +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +MEM equ 16*16*2*2 ; two blocks of data stored in stack +; STACK_SIZE must be an odd multiple of 8 bytes in size +STACK_SIZE equ MEM + 16*8 + 8 + +%define AA rsp + MEM + 16*0 +%define BB rsp + MEM + 16*1 +%define CC rsp + MEM + 16*2 +%define DD rsp + MEM + 16*3 +%define AA2 rsp + MEM + 16*4 +%define BB2 rsp + MEM + 16*5 +%define CC2 rsp + MEM + 16*6 +%define DD2 rsp + MEM + 16*7 + +;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word + +;#define NUM_MD5_DIGEST_WORDS 4 +;#define NUM_LANES 8 +;#define MD5_BLOCK_SIZE 64 +; +;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES]; +; +;typedef struct { +; DECLARE_ALIGNED(digest_array digest, 16); +; UINT8* data_ptr[NUM_LANES]; +;} MD5_ARGS_X8; + +; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size) +; arg 1 : pointer to MD5_ARGS_X8 structure +; arg 2 : size (in blocks) ;; assumed to be >= 1 +; +; arg1 and r8 are maintained by this function +; +align 32 +mk_global md5_mb_x4x2_sse, function, internal +md5_mb_x4x2_sse: + endbranch + sub rsp, STACK_SIZE + + ;; Initialize digests + movdqu A,[arg1+0*16] + movdqu B,[arg1+2*16] + movdqu C,[arg1+4*16] + movdqu D,[arg1+6*16] + + ;; Initialize digests + movdqu A2,[arg1+1*16] + movdqu B2,[arg1+3*16] + movdqu C2,[arg1+5*16] + movdqu D2,[arg1+7*16] + + lea TBL, [MD5_TABLE] + + ;; load input pointers + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + mov inp4,[arg1 + _data_ptr + 4*8] + mov inp5,[arg1 + _data_ptr + 5*8] + mov inp6,[arg1 + _data_ptr + 6*8] + mov inp7,[arg1 + _data_ptr + 7*8] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16],T0 + movdqa [mem1+(I*4+1)*16],T1 + movdqa [mem1+(I*4+2)*16],T2 + movdqa [mem1+(I*4+3)*16],T3 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16 + 16*16],T0 + movdqa [mem1+(I*4+1)*16 + 16*16],T1 + movdqa [mem1+(I*4+2)*16 + 16*16],T2 + movdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + movdqa [AA], A + movdqa [BB], B + movdqa [CC], C + movdqa [DD], D + ; save old digests + movdqa [AA2], A2 + movdqa [BB2], B2 + movdqa [CC2], C2 + movdqa [DD2], D2 + + add IDX, 4*16 + sub arg2, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, 
T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, 
[TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, 
mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, 
D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; write out digests + movdqu [arg1+0*16], A + movdqu [arg1+2*16], B + movdqu [arg1+4*16], C + movdqu [arg1+6*16], D + movdqu [arg1+1*16], A2 + movdqu [arg1+3*16], B2 + movdqu [arg1+5*16], C2 + movdqu [arg1+7*16], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + mov [arg1 + _data_ptr + 1*8], inp1 + mov [arg1 + _data_ptr + 2*8], inp2 + mov [arg1 + _data_ptr + 3*8], inp3 + mov [arg1 + _data_ptr + 4*8], inp4 + mov [arg1 + _data_ptr + 5*8], inp5 + mov [arg1 + _data_ptr + 6*8], inp6 + mov [arg1 + _data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + + ret + +section .data align=64 + +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 
0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm new file mode 100644 index 000000000..b5d6a4875 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm @@ -0,0 +1,920 @@ 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute double octal MD5 using AVX2 + +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers ymm0-15 + +;; clobbers all GPRs other than arg1 and rbp + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx + %define arg2 rdx + %define reg3 rdi + %define reg4 rsi +%else + %define arg1 rdi + %define arg2 rsi + %define reg3 rcx + %define reg4 rdx +%endif + +;; rbp is not clobbered + +%define state arg1 +%define num_blks arg2 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +;; These are pointers to data block1 and block2 in the stack +; which will ping pong back and forth +%define DPTR1 rbx +%define DPTR2 reg3 + +%define TBL rax +%define IDX reg4 + +;; Transposed Digest Storage +%define Y_A ymm0 +%define Y_B ymm1 +%define Y_C ymm2 +%define Y_D ymm3 +%define Y_A2 ymm4 +%define Y_B2 ymm5 +%define Y_C2 ymm6 +%define Y_D2 ymm7 + +;; Temp YMM registers corresponding to the Temp XMM registers +;; used during the transposition of the digests +%define Y_KTMP1 ymm12 +%define Y_KTMP2 ymm13 +;; Temporary registers used during MD5 round operations +%define Y_FUN ymm8 +%define Y_TMP ymm9 +%define Y_FUN2 ymm10 +%define Y_TMP2 ymm11 + + +;; YMM registers used during data fetching. 
+;; Data are stored into the stack after transposition +%define Y_DAT0 ymm8 +%define Y_DAT1 ymm9 +%define Y_DAT2 ymm10 +%define Y_DAT3 ymm11 +%define Y_DAT4 ymm12 +%define Y_DAT5 ymm13 +%define Y_DAT6 ymm14 +%define Y_DAT7 ymm15 + +;; Temporary registers used during data transposition +%define Y_DTMP1 ymm0 +%define Y_DTMP2 ymm1 + + +%define RESY resb 32* +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESIZE mod 32 must be 32-8 = 24 +struc STACK +_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2 +_TMPDIGEST: RESY 2 ; stores Y_AA, Y_BB temporarily +_RSP_SAVE: RESQ 1 ; original RSP +endstruc + + +%define Y_AA rsp + _DIGEST + 32*0 +%define Y_BB rsp + _DIGEST + 32*1 +%define Y_CC rsp + _DIGEST + 32*2 +%define Y_DD rsp + _DIGEST + 32*3 +%define Y_AA2 rsp + _DIGEST + 32*4 +%define Y_BB2 rsp + _DIGEST + 32*5 +%define Y_CC2 rsp + _DIGEST + 32*6 +%define Y_DD2 rsp + _DIGEST + 32*7 + +%define MD5_DIGEST_ROW_SIZE (16*4) + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +; "transpose" data in {r0...r7} using temps {t0...t1} +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + +; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, 
%%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpand %%F,%%F,%%X + vpxor %%F,%%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpxor %%F,%%F, %%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpcmpeqd %%F,%%F,%%F ; 0xFFFF + vpxor %%F,%%F,%%Z ; pnot %%Z + vpor %%F,%%F,%%X + vpxor %%F,%%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%rA %2 +%define %%rB %3 +%define %%rC %4 +%define %%rD %5 +%define %%rA2 %6 +%define %%rB2 %7 +%define %%rC2 %8 +%define %%rD2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + vpaddd %%rA, %%rA, %%MD5const + vpaddd %%rA2, %%rA2, %%MD5const + vpaddd %%rA, %%rA, [%%data] + vpaddd %%rA2, %%rA2, [%%data + 16*32] + %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD + %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2 + vpaddd %%rA, %%rA, %%FUN + vpaddd %%rA2, %%rA2, %%FUN2 + PROLD %%rA,%%nrot, %%TMP + PROLD %%rA2,%%nrot, %%TMP2 + vpaddd %%rA, %%rA, %%rB + vpaddd %%rA2, %%rA2, %%rB2 +%endmacro + +align 32 + +; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) + +mk_global md5_mb_x8x2_avx2, function, internal +md5_mb_x8x2_avx2: + endbranch + mov rax, rsp + sub rsp, STACK_size + and rsp, -32 + mov [rsp + _RSP_SAVE], rax + + mov DPTR1, rsp + lea DPTR2, [rsp + 32*32] + + ;; Load MD5 constant pointer to register + lea TBL, [MD5_TABLE] + + ; Initialize index for data retrieval + xor IDX, IDX + + ;; Fetch Pointers to Data Stream 1 to 8 + mov inp0,[state + _data_ptr + 0*8] + mov inp1,[state + _data_ptr + 1*8] + mov inp2,[state + _data_ptr + 2*8] + mov inp3,[state + _data_ptr + 3*8] + mov inp4,[state + _data_ptr + 4*8] + mov inp5,[state + _data_ptr + 5*8] + mov inp6,[state + _data_ptr + 6*8] + mov inp7,[state + _data_ptr + 7*8] + +%assign I 0 +%rep 2 + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa 
[DPTR1+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7 + +%assign I (I+1) +%endrep + + ;; Fetch Pointers to Data Stream 9 to 16 + mov inp0,[state + _data_ptr + 8*8] + mov inp1,[state + _data_ptr + 9*8] + mov inp2,[state + _data_ptr + 10*8] + mov inp3,[state + _data_ptr + 11*8] + mov inp4,[state + _data_ptr + 12*8] + mov inp5,[state + _data_ptr + 13*8] + mov inp6,[state + _data_ptr + 14*8] + mov inp7,[state + _data_ptr + 15*8] + +%assign I 0 +%rep 2 + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7 + +%assign I (I+1) +%endrep + ;; digests are already transposed + vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ] + + ; Load the digest for each stream (9-16) + vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32] + +lloop: + + ; save old digests to stack + vmovdqa [Y_AA], Y_A + vmovdqa [Y_BB], Y_B + vmovdqa [Y_CC], Y_C + vmovdqa [Y_DD], Y_D + + vmovdqa [Y_AA2], Y_A2 + vmovdqa [Y_BB2], Y_B2 + vmovdqa [Y_CC2], Y_C2 + vmovdqa [Y_DD2], Y_D2 + + ;; Increment IDX to point to next data block (64 bytes per block) + add IDX, 64 + + ;; Update size of remaining blocks to process + sub num_blks, 1 + je lastblock + + ; Perform the 64 rounds of processing ... + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 + + + ;; Fetch Pointers to Data Stream 1 to 8 ?? 
+ mov inp0,[state + _data_ptr + 0*8] + mov inp1,[state + _data_ptr + 1*8] + mov inp2,[state + _data_ptr + 2*8] + mov inp3,[state + _data_ptr + 3*8] + mov inp4,[state + _data_ptr + 4*8] + mov inp5,[state + _data_ptr + 5*8] + mov inp6,[state + _data_ptr + 6*8] + mov inp7,[state + _data_ptr + 7*8] + + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 + +%assign I 0 + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, 
DPTR1+ 3*32, [TBL+26*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 + +%assign I (I+1) + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 + + ;; Fetch Pointers to Data Stream 9 to 16 + mov inp0,[state + _data_ptr + 8*8] + mov inp1,[state + _data_ptr + 9*8] + mov inp2,[state + _data_ptr + 10*8] + mov inp3,[state + _data_ptr + 11*8] + mov inp4,[state + _data_ptr + 12*8] + mov inp5,[state + _data_ptr + 13*8] + mov inp6,[state + _data_ptr + 14*8] + mov inp7,[state + _data_ptr + 15*8] + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 + 
MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 + +%assign I 0 + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 + +%assign I (I+1) + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], 
Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + ; Add results to old digest values + + vpaddd Y_A,Y_A,[Y_AA] + vpaddd Y_B,Y_B,[Y_BB] + vpaddd Y_C,Y_C,[Y_CC] + vpaddd Y_D,Y_D,[Y_DD] + + vpaddd Y_A2,Y_A2,[Y_AA2] + vpaddd Y_B2,Y_B2,[Y_BB2] + vpaddd Y_C2,Y_C2,[Y_CC2] + vpaddd Y_D2,Y_D2,[Y_DD2] + + ; Swap DPTR1 and DPTR2 + xchg DPTR1, DPTR2 + + ;; Proceed to processing of next block + jmp lloop + +lastblock: + + ; Perform the 64 rounds of processing ... + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 + + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, 
Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 + + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, 
Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 + + ;; update into data pointers +%assign I 0 +%rep 8 + mov inp0, [state + _data_ptr + (2*I)*8] + mov inp1, [state + _data_ptr + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [state + _data_ptr + (2*I)*8], inp0 + mov [state + _data_ptr + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + vpaddd Y_A,Y_A,[Y_AA] + vpaddd Y_B,Y_B,[Y_BB] + vpaddd Y_C,Y_C,[Y_CC] + vpaddd Y_D,Y_D,[Y_DD] + + vpaddd Y_A2,Y_A2,[Y_AA2] + vpaddd Y_B2,Y_B2,[Y_BB2] + vpaddd Y_C2,Y_C2,[Y_CC2] + vpaddd Y_D2,Y_D2,[Y_DD2] + + + + vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A + vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B + vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C + vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D + + + vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width + vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2 + vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2 + vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2 + + + ;;;;;;;;;;;;;;;; + ;; Postamble + + + + mov rsp, [rsp + _RSP_SAVE] + + ret + +section .data +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0xfd469501, 
0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xeaa127fa, 0xeaa127fa, 
0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm new file mode 100644 index 000000000..6e31d297a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm @@ -0,0 +1,80 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern md5_ctx_mgr_init_sse +extern md5_ctx_mgr_submit_sse +extern md5_ctx_mgr_flush_sse + +extern md5_ctx_mgr_init_avx +extern md5_ctx_mgr_submit_avx +extern md5_ctx_mgr_flush_avx + +extern md5_ctx_mgr_init_avx2 +extern md5_ctx_mgr_submit_avx2 +extern md5_ctx_mgr_flush_avx2 + +%ifdef HAVE_AS_KNOWS_AVX512 + extern md5_ctx_mgr_init_avx512 + extern md5_ctx_mgr_submit_avx512 + extern md5_ctx_mgr_flush_avx512 +%endif + +extern md5_ctx_mgr_init_base +extern md5_ctx_mgr_submit_base +extern md5_ctx_mgr_flush_base + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +; Initialise symbols +mbin_interface md5_ctx_mgr_init +mbin_interface md5_ctx_mgr_submit +mbin_interface md5_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_base, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512 + mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_base, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512 + mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_base, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512 +%else + mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2 + mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2 + mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2 +%endif + +;; func core, ver, snum +slversion md5_ctx_mgr_init, 00, 04, 0189 +slversion md5_ctx_mgr_submit, 00, 04, 018a +slversion md5_ctx_mgr_flush, 00, 04, 018b diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c new file mode 100644 index 000000000..ed4721107 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c @@ -0,0 +1,186 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference MD5 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +static void OPT_FIX md5_single(const uint8_t * data, uint32_t digest[4]); + +#define H0 0x67452301 +#define H1 0xefcdab89 +#define H2 0x98badcfe +#define H3 0x10325476 + +void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len) +{ + uint32_t i, j; + uint8_t buf[128]; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + + i = len; + while (i >= 64) { + md5_single(input_data, digest); + input_data += 64; + i -= 64; + } + // 0 <= i < 64 + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < 120; j++) + buf[j] = 0; + + if (i > 64 - 8) + i = 128; + else + i = 64; + + *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) len * 8); + + md5_single(buf, digest); + if (i == 128) + md5_single(buf + 64, digest); +} + +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (c ^ (d & (b ^ c))) +#define F3(b,c,d) (b ^ c ^ d) +#define F4(b,c,d) (c ^ (b | ~d)) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + +#define step(i,a,b,c,d,f,k,w,r) \ + if (i < 16) {f = F1(b,c,d); } else \ + if (i < 32) {f = F2(b,c,d); } else \ + if (i < 48) {f = F3(b,c,d); } else \ + {f = F4(b,c,d); } \ + f = a + f + k + to_le32(w); \ + a = b + rol32(f, r); + +void md5_single(const uint8_t * data, uint32_t digest[4]) +{ + uint32_t a, b, c, d; + uint32_t f; + uint32_t *w = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + + step(0, a, b, c, d, f, 0xd76aa478, w[0], 7); + step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12); + step(2, c, d, a, b, f, 0x242070db, w[2], 17); + step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22); + step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7); + step(5, d, a, b, c, f, 0x4787c62a, w[5], 12); + step(6, c, d, a, b, f, 0xa8304613, w[6], 17); + step(7, b, c, d, a, f, 0xfd469501, w[7], 22); + step(8, a, b, c, d, f, 0x698098d8, w[8], 
7); + step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12); + step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17); + step(11, b, c, d, a, f, 0x895cd7be, w[11], 22); + step(12, a, b, c, d, f, 0x6b901122, w[12], 7); + step(13, d, a, b, c, f, 0xfd987193, w[13], 12); + step(14, c, d, a, b, f, 0xa679438e, w[14], 17); + step(15, b, c, d, a, f, 0x49b40821, w[15], 22); + + step(16, a, b, c, d, f, 0xf61e2562, w[1], 5); + step(17, d, a, b, c, f, 0xc040b340, w[6], 9); + step(18, c, d, a, b, f, 0x265e5a51, w[11], 14); + step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20); + step(20, a, b, c, d, f, 0xd62f105d, w[5], 5); + step(21, d, a, b, c, f, 0x02441453, w[10], 9); + step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14); + step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20); + step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5); + step(25, d, a, b, c, f, 0xc33707d6, w[14], 9); + step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14); + step(27, b, c, d, a, f, 0x455a14ed, w[8], 20); + step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5); + step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9); + step(30, c, d, a, b, f, 0x676f02d9, w[7], 14); + step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20); + + step(32, a, b, c, d, f, 0xfffa3942, w[5], 4); + step(33, d, a, b, c, f, 0x8771f681, w[8], 11); + step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16); + step(35, b, c, d, a, f, 0xfde5380c, w[14], 23); + step(36, a, b, c, d, f, 0xa4beea44, w[1], 4); + step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11); + step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16); + step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23); + step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4); + step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11); + step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16); + step(43, b, c, d, a, f, 0x04881d05, w[6], 23); + step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4); + step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11); + step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16); + step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23); + + step(48, a, b, c, d, f, 0xf4292244, w[0], 6); + step(49, d, a, b, c, f, 0x432aff97, w[7], 10); + step(50, c, d, a, b, f, 0xab9423a7, w[14], 15); + step(51, b, c, d, a, f, 0xfc93a039, w[5], 21); + step(52, a, b, c, d, f, 0x655b59c3, w[12], 6); + step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10); + step(54, c, d, a, b, f, 0xffeff47d, w[10], 15); + step(55, b, c, d, a, f, 0x85845dd1, w[1], 21); + step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6); + step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10); + step(58, c, d, a, b, f, 0xa3014314, w[6], 15); + step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21); + step(60, a, b, c, d, f, 0xf7537e82, w[4], 6); + step(61, d, a, b, c, f, 0xbd3af235, w[11], 10); + step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15); + step(63, b, c, d, a, f, 0xeb86d391, w[9], 21); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am new file mode 100644 index 000000000..696e9c57d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am @@ -0,0 +1,83 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_mh_sha1_base = \ + mh_sha1/mh_sha1_block_base.c \ + mh_sha1/mh_sha1_finalize_base.c \ + mh_sha1/mh_sha1_update_base.c \ + mh_sha1/sha1_for_mh_sha1.c \ + mh_sha1/mh_sha1.c + +lsrc_x86_64 += \ + $(lsrc_mh_sha1_base) \ + mh_sha1/mh_sha1_multibinary.asm \ + mh_sha1/mh_sha1_block_sse.asm \ + mh_sha1/mh_sha1_block_avx.asm \ + mh_sha1/mh_sha1_block_avx2.asm \ + mh_sha1/mh_sha1_block_avx512.asm \ + mh_sha1/mh_sha1_avx512.c + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += \ + $(lsrc_mh_sha1_base) \ + mh_sha1/aarch64/mh_sha1_multibinary.S \ + mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c \ + mh_sha1/aarch64/mh_sha1_block_asimd.S \ + mh_sha1/aarch64/mh_sha1_asimd.c \ + mh_sha1/aarch64/mh_sha1_block_ce.S \ + mh_sha1/aarch64/mh_sha1_ce.c + +lsrc_base_aliases += \ + $(lsrc_mh_sha1_base) \ + mh_sha1/mh_sha1_base_aliases.c + +other_src += mh_sha1/mh_sha1_ref.c \ + include/reg_sizes.asm \ + include/multibinary.asm \ + include/test.h \ + mh_sha1/mh_sha1_internal.h + +src_include += -I $(srcdir)/mh_sha1 + +extern_hdrs += include/mh_sha1.h + +check_tests += mh_sha1/mh_sha1_test +unit_tests += mh_sha1/mh_sha1_update_test + +perf_tests += mh_sha1/mh_sha1_perf + + +mh_sha1_test: mh_sha1_ref.o +mh_sha1_mh_sha1_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la + +mh_sha1_update_test: mh_sha1_ref.o +mh_sha1_mh_sha1_update_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la + +mh_sha1_mh_sha1_perf_LDADD = libisal_crypto.la diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c new file mode 100644 index 000000000..2ad8871fa --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c @@ -0,0 +1,55 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(mh_sha1_update) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(mh_sha1_update_ce); + + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(mh_sha1_update_asimd); + + return PROVIDER_BASIC(mh_sha1_update); + +} + +DEFINE_INTERFACE_DISPATCHER(mh_sha1_finalize) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(mh_sha1_finalize_ce); + + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(mh_sha1_finalize_asimd); + + return PROVIDER_BASIC(mh_sha1_finalize); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c new file mode 100644 index 000000000..c913a64df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c @@ -0,0 +1,53 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include "mh_sha1_internal.h" + +void mh_sha1_block_asimd(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +/***************mh_sha1_update***********/ +// mh_sha1_update_asimd.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_asimd +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************mh_sha1_finalize AND mh_sha1_tail***********/ +// mh_sha1_tail is used to calculate the last incomplete src data block +// mh_sha1_finalize is a mh_sha1_ctx wrapper of mh_sha1_tail +// mh_sha1_finalize_asimd.c and mh_sha1_tail_asimd.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_asimd +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S new file mode 100644 index 000000000..22f716f27 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S @@ -0,0 +1,124 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + .arch armv8-a + +#include "sha1_asimd_common.S" + +.macro load_x4_word idx:req + ld1 {WORD\idx\().16b},[segs_ptr] + add segs_ptr,segs_ptr,#64 +.endm + +/* + * void mh_sha1_block_asimd (const uint8_t * input_data, + * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + * uint32_t num_blocks); + * arg 0 pointer to input data + * arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) + * arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. + * arg 3 number of 1KB blocks + */ + + input_data .req x0 + sha1_digest .req x1 + data_buf .req x2 + num_blocks .req w3 + src .req x4 + dst .req x5 + offs .req x6 + mh_segs .req x7 + tmp .req x8 + segs_ptr .req x9 + block_ctr .req w10 + + .global mh_sha1_block_asimd + .type mh_sha1_block_asimd, %function +mh_sha1_block_asimd: + cmp num_blocks, #0 + beq .return + sha1_asimd_save_stack + + mov mh_segs, #0 +.seg_loops: + add segs_ptr,input_data,mh_segs + mov offs, #64 + add src, sha1_digest, mh_segs + ld1 {VA.4S}, [src], offs + ld1 {VB.4S}, [src], offs + ld1 {VC.4S}, [src], offs + ld1 {VD.4S}, [src], offs + ld1 {VE.4S}, [src], offs + mov block_ctr,num_blocks + +.block_loop: + sha1_single + subs block_ctr, block_ctr, 1 + bne .block_loop + + mov offs, #64 + add dst, sha1_digest, mh_segs + st1 {VA.4S}, [dst], offs + st1 {VB.4S}, [dst], offs + st1 {VC.4S}, [dst], offs + st1 {VD.4S}, [dst], offs + st1 {VE.4S}, [dst], offs + + add mh_segs, mh_segs, #16 + cmp mh_segs, #64 + bne .seg_loops + + sha1_asimd_restore_stack +.return: + ret + + .size mh_sha1_block_asimd, .-mh_sha1_block_asimd + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +KEY_0: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S new file mode 100644 index 000000000..12d3c5df2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S @@ -0,0 +1,384 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + + + +/* +Variable list +*/ + + declare_var_vector_reg lane0_msg_0, 0 + declare_var_vector_reg lane1_msg_0, 1 + declare_var_vector_reg lane2_msg_0, 2 + declare_var_vector_reg lane3_msg_0, 3 + declare_var_vector_reg lane0_msg_1, 4 + declare_var_vector_reg lane1_msg_1, 5 + declare_var_vector_reg lane2_msg_1, 6 + declare_var_vector_reg lane3_msg_1, 7 + declare_var_vector_reg lane0_msg_2, 8 + declare_var_vector_reg lane1_msg_2, 9 + declare_var_vector_reg lane2_msg_2,10 + declare_var_vector_reg lane3_msg_2,11 + declare_var_vector_reg lane0_msg_3,12 + declare_var_vector_reg lane1_msg_3,13 + declare_var_vector_reg lane2_msg_3,14 + declare_var_vector_reg lane3_msg_3,15 + + declare_var_vector_reg lane0_abcd ,16 + declare_var_vector_reg lane1_abcd ,17 + declare_var_vector_reg lane2_abcd ,18 + declare_var_vector_reg lane3_abcd ,19 + declare_var_vector_reg lane0_tmp0 ,20 + declare_var_vector_reg lane1_tmp0 ,21 + declare_var_vector_reg lane2_tmp0 ,22 + declare_var_vector_reg lane3_tmp0 ,23 + declare_var_vector_reg lane0_tmp1 ,24 + declare_var_vector_reg lane1_tmp1 ,25 + declare_var_vector_reg lane2_tmp1 ,26 + declare_var_vector_reg lane3_tmp1 ,27 + + + declare_var_vector_reg e0 ,28 + declare_var_vector_reg e1 ,29 + declare_var_vector_reg key ,30 + declare_var_vector_reg tmp ,31 + + key_adr .req x4 + msg_adr .req x5 + block_cnt .req x6 + offs .req x7 + digest_adr .req x16 + tmp0_adr .req x17 + tmp1_adr .req x18 + +/** +maros for round 4-67 +*/ +.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req + sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s + sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s + sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s + sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s + mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0] + mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0] + mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0] + mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0] + mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0] + mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1] + mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2] + mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3] + \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s + \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s + \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s + \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s + ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr] + add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s + add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s + add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s + add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s + st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr] + sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s + sha1su1 
lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s + sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s + sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s + sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s + sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s + sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s + sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s + +.endm + + +/* + void mh_sha1_block_ce(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks) +*/ +/* +Arguements list +*/ + input_data .req x0 + digests .req x1 + frame_buffer .req x2 + num_blocks .req w3 + + .global mh_sha1_block_ce + .type mh_sha1_block_ce, %function +mh_sha1_block_ce: + //save temp vector registers + stp d8, d9, [sp, -128]! + + stp d10, d11, [sp, 16] + stp d12, d13, [sp, 32] + stp d14, d15, [sp, 48] + mov tmp0_adr,frame_buffer + add tmp1_adr,tmp0_adr,128 + + +start_loop: + mov block_cnt,0 + mov msg_adr,input_data +lane_loop: + mov offs,64 + adr key_adr,KEY_0 + //load msg 0 + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs + + add digest_adr,digests,block_cnt + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs + ldr e0_q,[digest_adr] + + //load key_0 + ldr key_q,[key_adr] + + rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b + rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b + rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b + rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b + rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b + rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b + rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b + rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b + rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b + rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b + rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b + rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b + rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b + rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b + rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b + rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b + + add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s + add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s + add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s + add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s + st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr] + + add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s + add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s + add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s + add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s + + /* rounds 0-3 */ + sha1h 
lane0_tmp1_s,lane0_abcd_s + sha1h lane1_tmp1_s,lane1_abcd_s + sha1h lane2_tmp1_s,lane2_abcd_s + sha1h lane3_tmp1_s,lane3_abcd_s + mov e1_v.S[0],lane0_tmp1_v.S[0] + mov e1_v.S[1],lane1_tmp1_v.S[0] + mov e1_v.S[2],lane2_tmp1_v.S[0] + mov e1_v.S[3],lane3_tmp1_v.S[0] + mov lane0_tmp1_v.S[0],e0_v.S[0] + mov lane1_tmp1_v.S[0],e0_v.S[1] + mov lane2_tmp1_v.S[0],e0_v.S[2] + mov lane3_tmp1_v.S[0],e0_v.S[3] + sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s + sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s + sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s + sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s + ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr] + add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s + sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s + add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s + sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s + add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s + sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s + add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s + sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s + st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr] + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + + + adr key_adr,KEY_1 + ldr key_q,[key_adr] + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + + adr key_adr,KEY_2 + ldr key_q,[key_adr] + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + + adr key_adr,KEY_3 + ldr key_q,[key_adr] + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + + //msg2 and msg1 are free + mov lane0_msg_2_v.S[0],e1_v.S[0] + mov lane1_msg_2_v.S[0],e1_v.S[1] + mov lane2_msg_2_v.S[0],e1_v.S[2] + mov lane3_msg_2_v.S[0],e1_v.S[3] + + /* rounds 68-71 */ + sha1h lane0_msg_1_s,lane0_abcd_s + sha1h lane1_msg_1_s,lane1_abcd_s + sha1h lane2_msg_1_s,lane2_abcd_s + sha1h lane3_msg_1_s,lane3_abcd_s + sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s + sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s + sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s + sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s + add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s + add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s + add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s + add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s + sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s + sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s + sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s + sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s + + /* rounds 72-75 */ + sha1h lane0_msg_2_s,lane0_abcd_s + sha1h lane1_msg_2_s,lane1_abcd_s + sha1h lane2_msg_2_s,lane2_abcd_s + sha1h lane3_msg_2_s,lane3_abcd_s + sha1p 
lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s + sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s + sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s + sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s + + /* rounds 76-79 */ + sha1h lane0_msg_1_s,lane0_abcd_s + sha1h lane1_msg_1_s,lane1_abcd_s + sha1h lane2_msg_1_s,lane2_abcd_s + sha1h lane3_msg_1_s,lane3_abcd_s + sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s + sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s + sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s + sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s + add digest_adr,digests,block_cnt + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr] + + add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S + add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S + add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S + add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S + + add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S + add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S + add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S + add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S + + add digest_adr,digests,block_cnt + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs + st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr] + + add block_cnt,block_cnt,16 + cmp block_cnt,64 + add msg_adr,input_data,block_cnt + add digest_adr,digests,block_cnt + bcc lane_loop + + subs num_blocks,num_blocks,1 + add input_data,input_data,1024 + bhi start_loop +exit_func: + //restore temp register + ldp d10, d11, [sp, 16] + ldp d12, d13, [sp, 32] + ldp d14, d15, [sp, 48] + ldp d8, d9, [sp], 128 + ret + + .size mh_sha1_block_ce, .-mh_sha1_block_ce + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +KEY_0: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c new file mode 100644 index 000000000..c35daeab0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c @@ -0,0 +1,53 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <string.h> +#include "mh_sha1_internal.h" + +void mh_sha1_block_ce(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +/***************mh_sha1_update***********/ +// mh_sha1_update_ce.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_ce +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************mh_sha1_finalize AND mh_sha1_tail***********/ +// mh_sha1_tail is used to calculate the last incomplete src data block +// mh_sha1_finalize is a mh_sha1_ctx wrapper of mh_sha1_tail +// mh_sha1_finalize_ce.c and mh_sha1_tail_ce.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_ce +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S new file mode 100644 index 000000000..9a6d0caea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface mh_sha1_update +mbin_interface mh_sha1_finalize diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S new file mode 100644 index 000000000..c8b8dd982 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + .arch armv8-a + +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor VF.16b, VC.16b, VD.16b + and VF.16b, VB.16b, VF.16b + eor VF.16b, VD.16b, VF.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor VF.16b, VB.16b, VC.16b + eor VF.16b, VF.16b, VD.16b +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and vT0.16b, VB.16b, VC.16b + and vT1.16b, VB.16b, VD.16b + and vT2.16b, VC.16b, VD.16b + orr VF.16b, vT0.16b, vT1.16b + orr VF.16b, VF.16b, vT2.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_x4_word \windex + .endif +.endm + +// FUNC_F0 is merged into STEP_00_15 for efficiency +.macro SHA1_STEP_00_15_F0 windex:req + rev32 WORD\windex\().16b,WORD\windex\().16b + next_word=\windex+1 + load_next_word %next_word + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, VA.4s, 32 - 5 + add VE.4s, VE.4s, VK.4s + sli VT.4s, VA.4s, 5 + eor VF.16b, VC.16b, VD.16b + add VE.4s, VE.4s, WORD\windex\().4s + and VF.16b, VB.16b, VF.16b + add VE.4s, VE.4s, VT.4s + eor VF.16b, VD.16b, VF.16b + ushr VT.4s, VB.4s, 32 - 30 + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor vT0.16b,\reg_3\().16b,\reg_8\().16b + eor VT.16b,\reg_14\().16b,\reg_16\().16b + eor vT0.16b,vT0.16b,VT.16b + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, vT0.4s, 32 - 1 + add VE.4s, VE.4s, VK.4s + ushr vT1.4s, VA.4s, 32 - 5 + sli VT.4s, vT0.4s, 1 + add VE.4s, VE.4s, VT.4s + sli vT1.4s, VA.4s, 5 + mov \reg_16\().16b,VT.16b + add VE.4s, VE.4s, vT1.4s + ushr VT.4s, VB.4s, 32 - 30 + \func_f + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + + VA .req v0 + VB .req v1 + VC .req v2 + VD .req v3 + VE .req v4 + VT .req v5 + VF .req v6 + VK .req v7 + WORD0 .req v8 + WORD1 .req v9 + WORD2 .req v10 + WORD3 .req v11 + WORD4 .req v12 + WORD5 .req v13 + WORD6 .req v14 + WORD7 .req v15 + WORD8 .req v16 + WORD9 .req v17 + WORD10 .req v18 + WORD11 .req v19 + WORD12 .req v20 + WORD13 .req v21 + WORD14 .req v22 + WORD15 .req v23 + vT0 .req v24 + vT1 .req v25 + vT2 .req v26 + vAA .req v27 + vBB .req v28 + vCC .req v29 + vDD .req v30 + vEE .req v31 + TT .req v0 + sha1key_adr .req x15 + +.macro SWAP_STATES + // shifted VB is held in VT after each step + .unreq TT + TT .req VE + .unreq VE + VE .req VD + .unreq VD + VD .req VC + .unreq VC + VC .req VT + .unreq VT + VT .req VB + .unreq VB + VB .req VA + .unreq VA + VA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15_F0 windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES + + .if \windex == 79 + // after 80 steps, the registers ABCDET has shifted from + // its 
orignal order of 012345 to 341520 + // have to swap back for both compile- and run-time correctness + mov v0.16b,v3.16b + .unreq VA + VA .req v0 + + mov vT0.16b,v2.16b + mov v2.16b,v1.16b + mov v1.16b,v4.16b + .unreq VB + VB .req v1 + .unreq VC + VC .req v2 + + mov v3.16b,v5.16b + .unreq VD + VD .req v3 + + mov v4.16b,vT0.16b + .unreq VE + VE .req v4 + + .unreq VT + VT .req v5 + .endif +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.macro sha1_single + load_x4_word 0 + + mov vAA.16B, VA.16B + mov vBB.16B, VB.16B + mov vCC.16B, VC.16B + mov vDD.16B, VD.16B + mov vEE.16B, VE.16B + + adr sha1key_adr, KEY_0 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add VA.4s, vAA.4s, VA.4s + add VB.4s, vBB.4s, VB.4s + add VC.4s, vCC.4s, VC.4s + add VD.4s, vDD.4s, VD.4s + add VE.4s, vEE.4s, VE.4s +.endm + +.macro sha1_asimd_save_stack + stp d8,d9,[sp, -64]! + stp d10,d11,[sp, 16] + stp d12,d13,[sp, 32] + stp d14,d15,[sp, 48] +.endm + +.macro sha1_asimd_restore_stack + ldp d10,d11,[sp, 16] + ldp d12,d13,[sp, 32] + ldp d14,d15,[sp, 48] + ldp d8,d9,[sp],64 +.endm diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c new file mode 100644 index 000000000..e5d8ad86d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c @@ -0,0 +1,141 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <string.h> +#include "mh_sha1_internal.h" + +int mh_sha1_init(struct mh_sha1_ctx *ctx) +{ + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; + uint32_t i; + + if (ctx == NULL) + return MH_SHA1_CTX_ERROR_NULL; + + memset(ctx, 0, sizeof(*ctx)); + + mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; + for (i = 0; i < HASH_SEGS; i++) { + mh_sha1_segs_digests[0][i] = MH_SHA1_H0; + mh_sha1_segs_digests[1][i] = MH_SHA1_H1; + mh_sha1_segs_digests[2][i] = MH_SHA1_H2; + mh_sha1_segs_digests[3][i] = MH_SHA1_H3; + mh_sha1_segs_digests[4][i] = MH_SHA1_H4; + } + + return MH_SHA1_CTX_ERROR_NONE; +} + +#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \ + || defined( _M_X64) || defined(_M_IX86)) +/***************mh_sha1_update***********/ +// mh_sha1_update_sse.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_sse +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +// mh_sha1_update_avx.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +// mh_sha1_update_avx2.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2 +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2 +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************mh_sha1_finalize AND mh_sha1_tail***********/ +// mh_sha1_tail is used to calculate the last incomplete src data block +// mh_sha1_finalize is a mh_sha1_ctx wrapper of mh_sha1_tail + +// mh_sha1_finalize_sse.c and mh_sha1_tail_sse.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_sse +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +// mh_sha1_finalize_avx.c and mh_sha1_tail_avx.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +// mh_sha1_finalize_avx2.c and mh_sha1_tail_avx2.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx2 +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2 +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2 +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************version info***********/ + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +// Version info +struct slver mh_sha1_init_slver_00000271; +struct slver mh_sha1_init_slver = { 0x0271, 0x00, 0x00 }; + +// mh_sha1_update version info +struct slver mh_sha1_update_sse_slver_00000274; +struct slver mh_sha1_update_sse_slver = { 0x0274, 0x00, 0x00 }; + +struct slver mh_sha1_update_avx_slver_02000276; +struct slver mh_sha1_update_avx_slver = { 0x0276, 0x00, 0x02 }; + +struct slver mh_sha1_update_avx2_slver_04000278; +struct slver mh_sha1_update_avx2_slver = { 0x0278, 0x00, 0x04 }; + +// mh_sha1_finalize version info +struct slver mh_sha1_finalize_sse_slver_00000275; +struct slver mh_sha1_finalize_sse_slver = { 0x0275, 0x00, 0x00 }; + +struct slver
mh_sha1_finalize_avx_slver_02000277; +struct slver mh_sha1_finalize_avx_slver = { 0x0277, 0x00, 0x02 }; + +struct slver mh_sha1_finalize_avx2_slver_04000279; +struct slver mh_sha1_finalize_avx2_slver = { 0x0279, 0x00, 0x04 }; + +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c new file mode 100644 index 000000000..1305d048f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c @@ -0,0 +1,70 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <string.h> +#include "mh_sha1_internal.h" + +#ifdef HAVE_AS_KNOWS_AVX512 + +/***************mh_sha1_update***********/ +// mh_sha1_update_avx512.c +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx512 +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512 +#include "mh_sha1_update_base.c" +#undef MH_SHA1_UPDATE_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************mh_sha1_finalize AND mh_sha1_tail***********/ +// mh_sha1_tail is used to calculate the last incomplete src data block +// mh_sha1_finalize is a mh_sha1_ctx wrapper of mh_sha1_tail +// mh_sha1_finalize_avx512.c and mh_sha1_tail_avx512.c +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx512 +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512 +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512 +#include "mh_sha1_finalize_base.c" +#undef MH_SHA1_FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION +#undef MH_SHA1_BLOCK_FUNCTION + +/***************version info***********/ +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// mh_sha1_update version info +struct slver mh_sha1_update_avx512_slver_0600027c; +struct slver mh_sha1_update_avx512_slver = { 0x027c, 0x00, 0x06 }; + +// mh_sha1_finalize version info +struct slver mh_sha1_finalize_avx512_slver_0600027d; +struct slver mh_sha1_finalize_avx512_slver = { 0x027d, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c new file mode 100644 index 000000000..18cd8161b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c @@ -0,0 +1,40 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/ +#include "mh_sha1_internal.h" +#include +int mh_sha1_update(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len) +{ + return mh_sha1_update_base(ctx, buffer, len); + +} + +int mh_sha1_finalize(struct mh_sha1_ctx *ctx, void *mh_sha1_digest) +{ + return mh_sha1_finalize_base(ctx, mh_sha1_digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm new file mode 100644 index 000000000..f4b5e76a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm @@ -0,0 +1,506 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[%%data + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + 
%define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... 
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep | + +align 32 + +;void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha1_block_avx, function, internal +func(mh_sha1_block_avx) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by avx + and rsp, ~0x0F + + %assign I 0 ; copy segs_digests into stack + %rep 5 + VMOVPS A, [mh_digests_p + I*64 + 16*0] + VMOVPS B, [mh_digests_p + I*64 + 16*1] + VMOVPS C, [mh_digests_p + I*64 + 16*2] + VMOVPS D, [mh_digests_p + I*64 + 16*3] + + vmovdqa [rsp + I*64 + 16*0], A + vmovdqa [rsp + I*64 + 16*1], B + vmovdqa [rsp + I*64 + 16*2], C + vmovdqa [rsp + I*64 + 16*3], D + %assign I (I+1) + %endrep + + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4 + %assign I 0 + %rep 16 + VMOVPS T0,[mh_in_p + I*64+0*16] + VMOVPS T1,[mh_in_p + I*64+1*16] + VMOVPS T2,[mh_in_p + I*64+2*16] + VMOVPS T3,[mh_in_p + I*64+3*16] + + vpshufb T0, F + vmovdqa [mh_data_p +(I)*16 +0*256],T0 + vpshufb T1, F + vmovdqa [mh_data_p +(I)*16 +1*256],T1 + vpshufb T2, F + vmovdqa [mh_data_p +(I)*16 +2*256],T2 + vpshufb T3, F + vmovdqa [mh_data_p +(I)*16 +3*256],T3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + ;; Initialize digests + vmovdqa A, [rsp + 0*64 + mh_segs] + vmovdqa B, [rsp + 1*64 + mh_segs] + vmovdqa C, [rsp + 2*64 + mh_segs] + vmovdqa D, [rsp + 3*64 + mh_segs] + vmovdqa E, [rsp + 4*64 + mh_segs] + + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E +;; +;; perform 0-79 steps +;; + vmovdqa K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 16...19 + vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16] + vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16] + %rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] +;; do rounds 20...39 + vmovdqa K, [K20_39] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 40...59 + vmovdqa K, [K40_59] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*1] +;; do rounds 60...79 + vmovdqa K, [K60_79] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + vpaddd A, AA + vpaddd B, BB + vpaddd C, CC + vpaddd D, DD + vpaddd E, EE + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], A + vmovdqa [rsp + 1*64 + mh_segs], B + vmovdqa [rsp + 2*64 + mh_segs], C + vmovdqa [rsp + 3*64 + mh_segs], D + vmovdqa [rsp + 4*64 + mh_segs], E + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 
+ jne .block_loop + + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 5 + vmovdqa A, [rsp + I*64 + 16*0] + vmovdqa B, [rsp + I*64 + 16*1] + vmovdqa C, [rsp + I*64 + 16*2] + vmovdqa D, [rsp + I*64 + 16*3] + + VMOVPS [mh_digests_p + I*64 + 16*0], A + VMOVPS [mh_digests_p + I*64 + 16*1], B + VMOVPS [mh_digests_p + I*64 + 16*2], C + VMOVPS [mh_digests_p + I*64 + 16*3], D + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm new file mode 100644 index 000000000..fed35d83e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm @@ -0,0 +1,508 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX-2 +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | ((~ B) & D) ) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpand %%regF, %%regB,%%regC + vpandn %%regT, %%regB,%%regD + vpor %%regF, %%regT,%%regF +%endmacro + +;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + + + +;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-%%imm) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[%%data + (%%memW * 32)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + 
%define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 + +%define F ymm5 +%define T0 ymm6 +%define T1 ymm7 +%define T2 ymm8 +%define T3 ymm9 +%define T4 ymm10 +%define T5 ymm11 +%define T6 ymm12 +%define T7 ymm13 +%define T8 ymm14 +%define T9 ymm15 + +%define AA ymm5 +%define BB ymm6 +%define CC ymm7 +%define DD ymm8 +%define EE ymm9 +%define TMP ymm10 +%define FUN ymm11 +%define K ymm12 +%define W14 ymm13 +%define W15 ymm14 +%define W16 ymm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... 
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep | + +align 32 + +;void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha1_block_avx2, function, internal +func(mh_sha1_block_avx2) + endbranch + FUNC_SAVE + + ; save rsp + mov RSP_SAVE, rsp + + test loops, loops + jz .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 32 Bytes needed by avx2 + and rsp, ~0x1F + + %assign I 0 ; copy segs_digests into stack + %rep 2 + VMOVPS A, [mh_digests_p + I*32*5 + 32*0] + VMOVPS B, [mh_digests_p + I*32*5 + 32*1] + VMOVPS C, [mh_digests_p + I*32*5 + 32*2] + VMOVPS D, [mh_digests_p + I*32*5 + 32*3] + VMOVPS E, [mh_digests_p + I*32*5 + 32*4] + + vmovdqa [rsp + I*32*5 + 32*0], A + vmovdqa [rsp + I*32*5 + 32*1], B + vmovdqa [rsp + I*32*5 + 32*2], C + vmovdqa [rsp + I*32*5 + 32*3], D + vmovdqa [rsp + I*32*5 + 32*4], E + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vbroadcasti128 F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2 +%assign I 0 +%rep 16 + VMOVPS T0,[mh_in_p + I*64+0*32] + VMOVPS T1,[mh_in_p + I*64+1*32] + + vpshufb T0, T0, F + vmovdqa [mh_data_p +I*32+0*512],T0 + vpshufb T1, T1, F + vmovdqa [mh_data_p +I*32+1*512],T1 +%assign I (I+1) +%endrep + + xor mh_segs, mh_segs ;start from the first 8 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + ;; Initialize digests + vmovdqa A, [rsp + 0*64 + mh_segs] + vmovdqa B, [rsp + 1*64 + mh_segs] + vmovdqa C, [rsp + 2*64 + mh_segs] + vmovdqa D, [rsp + 3*64 + mh_segs] + vmovdqa E, [rsp + 4*64 + mh_segs] + + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E +;; +;; perform 0-79 steps +;; + vpbroadcastq K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32] + vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32] + %rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + PREFETCH_X [mh_in_p + pref+128*1] +;; do rounds 20...39 + vpbroadcastq K, [K20_39] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep +;; do rounds 40...59 + vpbroadcastq K, [K40_59] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*2] + PREFETCH_X [mh_in_p + pref+128*3] +;; do rounds 60...79 + vpbroadcastq K, [K60_79] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + vpaddd A,A, AA + vpaddd B,B, BB + vpaddd C,C, CC + vpaddd D,D, DD + vpaddd E,E, EE + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], A + vmovdqa [rsp + 1*64 + mh_segs], B + vmovdqa [rsp + 2*64 + mh_segs], C + vmovdqa [rsp + 3*64 + mh_segs], D + vmovdqa [rsp + 4*64 + mh_segs], E + + add pref, 512 + + add mh_data_p, 512 + add mh_segs, 32 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + 
add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 2 + vmovdqa A, [rsp + I*32*5 + 32*0] + vmovdqa B, [rsp + I*32*5 + 32*1] + vmovdqa C, [rsp + I*32*5 + 32*2] + vmovdqa D, [rsp + I*32*5 + 32*3] + vmovdqa E, [rsp + I*32*5 + 32*4] + + VMOVPS [mh_digests_p + I*32*5 + 32*0], A + VMOVPS [mh_digests_p + I*32*5 + 32*1], B + VMOVPS [mh_digests_p + I*32*5 + 32*2], C + VMOVPS [mh_digests_p + I*32*5 + 32*3], D + VMOVPS [mh_digests_p + I*32*5 + 32*4], E + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .rodata align=32 + +align 32 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm new file mode 100644 index 000000000..a72c21661 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm @@ -0,0 +1,406 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX-512 +;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovdqu64 +;SIMD variables definition +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define HH0 zmm5 +%define HH1 zmm6 +%define HH2 zmm7 +%define HH3 zmm8 +%define HH4 zmm9 +%define KT zmm10 +%define XTMP0 zmm11 +%define XTMP1 zmm12 +%define SHUF_MASK zmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;using extra 16 ZMM registers to place the inverse input data +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;macros definition +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%F_IMMED %2 + + ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt + ; E=D, D=C, C=ROTL_30(B), B=A, A=T + + ; Ft + ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) + ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D + ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) + + vmovdqa32 XTMP1, B ; Copy B + vpaddd E, E, %%WT ; E = E + Wt + vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) + vpaddd E, E, KT ; E = E + Wt + Kt + vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A) + vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt + vprold B, B, 30 ; B = ROTL_30(B) + vpaddd E, E, XTMP0 ; E = T + + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79 4 +%define %%WT %1 +%define %%WTp2 %2 +%define %%WTp8 %3 +%define %%WTp13 %4 + ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16) + ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt) + vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 + vpxord %%WT, %%WT, %%WTp13 + vprold %%WT, %%WT, 1 +%endmacro + +%define APPEND(a,b) a %+ b +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + ; remove unwind info macros + 
%define func(x) x: endbranch + %macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp + 0*16], xmm6 + movdqa [rsp + 1*16], xmm7 + movdqa [rsp + 2*16], xmm8 + movdqa [rsp + 3*16], xmm9 + movdqa [rsp + 4*16], xmm10 + movdqa [rsp + 5*16], xmm11 + movdqa [rsp + 6*16], xmm12 + movdqa [rsp + 7*16], xmm13 + movdqa [rsp + 8*16], xmm14 + movdqa [rsp + 9*16], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + mov [rsp + 10*16 + 2*8], r14 + mov [rsp + 10*16 + 3*8], r15 + mov [rsp + 10*16 + 4*8], rdi + mov [rsp + 10*16 + 5*8], rsi + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep | + +[bits 64] +section .text +align 32 + +;void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +global mh_sha1_block_avx512 +func(mh_sha1_block_avx512) + endbranch + FUNC_SAVE + + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; align rsp to 64 Bytes needed by avx512 + and rsp, ~0x3f + + ; copy segs_digests into registers. 
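+	; note: HH0..HH4 each hold one SHA-1 state word for all 16 segments
+	; (HH0 = h0 of segments a..p, ..., HH4 = h4 of segments a..p),
+	; matching the segs_digests layout described above.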
+ VMOVPS HH0, [mh_digests_p + 64*0] + VMOVPS HH1, [mh_digests_p + 64*1] + VMOVPS HH2, [mh_digests_p + 64*2] + VMOVPS HH3, [mh_digests_p + 64*3] + VMOVPS HH4, [mh_digests_p + 64*4] + ;a mask used to transform to big-endian data + vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK] + +.block_loop: + ;transform to big-endian data and store on aligned_frame + ;using extra 16 ZMM registers instead of stack +%assign I 0 +%rep 8 +%assign J (I+1) + VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64] + VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64] + + vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK + vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK +%assign I (I+2) +%endrep + + vmovdqa64 A, HH0 + vmovdqa64 B, HH1 + vmovdqa64 C, HH2 + vmovdqa64 D, HH3 + vmovdqa64 E, HH4 + + vmovdqa32 KT, [K00_19] +%assign I 0xCA +%assign J 0 +%assign K 2 +%assign L 8 +%assign M 13 +%assign N 0 +%rep 80 + PROCESS_LOOP APPEND(W,J), I + %if N < 64 + MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %endif + %if N = 19 + vmovdqa32 KT, [K20_39] + %assign I 0x96 + %elif N = 39 + vmovdqa32 KT, [K40_59] + %assign I 0xE8 + %elif N = 59 + vmovdqa32 KT, [K60_79] + %assign I 0x96 + %endif + %if N % 10 = 9 + PREFETCH_X [mh_in_p + 1024+128*(N / 10)] + %endif +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%assign N (N+1) +%endrep + + ; Add old digest + vpaddd HH0,A, HH0 + vpaddd HH1,B, HH1 + vpaddd HH2,C, HH2 + vpaddd HH3,D, HH3 + vpaddd HH4,E, HH4 + + add mh_in_p, 1024 + sub loops, 1 + jne .block_loop + + ; copy segs_digests to mh_digests_p + VMOVPS [mh_digests_p + 64*0], HH0 + VMOVPS [mh_digests_p + 64*1], HH1 + VMOVPS [mh_digests_p + 64*2], HH2 + VMOVPS [mh_digests_p + 64*3], HH3 + VMOVPS [mh_digests_p + 64*4], HH4 + + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + + +section .data align=64 + +align 64 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + +K20_39: dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + +K40_59: dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + +K60_79: dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_mh_sha1_block_avx512 +no_mh_sha1_block_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c new file mode 100644 index 000000000..402c9741a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c @@ -0,0 +1,387 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "mh_sha1_internal.h" +#include + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Base multi-hash SHA1 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s])) // only used for step 0 ~ 15 +#define update_w(s, i, w) (w[i&15][s] = rol32(w[(i-3)&15][s]^w[(i-8)&15][s]^w[(i-14)&15][s]^w[(i-16)&15][s], 1)) // used for step > 15 +#define update_e_1(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F1(b[s],c[s],d[s]) + K_00_19 + w[i&15][s]) +#define update_e_2(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F2(b[s],c[s],d[s]) + K_20_39 + w[i&15][s]) +#define update_e_3(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F3(b[s],c[s],d[s]) + K_40_59 + w[i&15][s]) +#define update_e_4(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F4(b[s],c[s],d[s]) + K_60_79 + w[i&15][s]) +#define update_b(s, b) (b[s] = rol32(b[s],30)) + +#define STORE_W(i, w, ww) \ + store_w(0, i, w, ww); \ + store_w(1, i, w, ww); \ + store_w(2, i, w, ww); \ + store_w(3, i, w, ww); \ + store_w(4, i, w, ww); \ + store_w(5, i, w, ww); \ + store_w(6, i, w, ww); \ + store_w(7, i, w, ww); \ + store_w(8, i, w, ww); \ + store_w(9, i, w, ww); \ + store_w(10, i, w, ww); \ + store_w(11, i, w, ww); \ + store_w(12, i, w, ww); \ + store_w(13, i, w, ww); \ + store_w(14, i, w, ww); \ + store_w(15, i, w, ww) + +#define UPDATE_W(i, w) \ + update_w(0, i, w); \ + update_w(1, i, w); \ + update_w(2, i, w); \ + update_w(3, i, w); \ + update_w(4, i, w); \ + update_w(5, i, w); \ + update_w(6, i, w); \ + update_w(7, i, w); \ + update_w(8, i, w); \ + update_w(9, i, w); \ + update_w(10, i, w); \ + update_w(11, i, w); \ + update_w(12, i, w); \ + update_w(13, i, w); \ + update_w(14, i, w); \ + update_w(15, i, w) + +#define UPDATE_E1(a, b, c, d, e, i, w) \ + update_e_1(0, 
a, b, c, d, e, i, w); \ + update_e_1(1, a, b, c, d, e, i, w); \ + update_e_1(2, a, b, c, d, e, i, w); \ + update_e_1(3, a, b, c, d, e, i, w); \ + update_e_1(4, a, b, c, d, e, i, w); \ + update_e_1(5, a, b, c, d, e, i, w); \ + update_e_1(6, a, b, c, d, e, i, w); \ + update_e_1(7, a, b, c, d, e, i, w); \ + update_e_1(8, a, b, c, d, e, i, w); \ + update_e_1(9, a, b, c, d, e, i, w); \ + update_e_1(10, a, b, c, d, e, i, w); \ + update_e_1(11, a, b, c, d, e, i, w); \ + update_e_1(12, a, b, c, d, e, i, w); \ + update_e_1(13, a, b, c, d, e, i, w); \ + update_e_1(14, a, b, c, d, e, i, w); \ + update_e_1(15, a, b, c, d, e, i, w) + +#define UPDATE_E2(a, b, c, d, e, i, w) \ + update_e_2(0, a, b, c, d, e, i, w); \ + update_e_2(1, a, b, c, d, e, i, w); \ + update_e_2(2, a, b, c, d, e, i, w); \ + update_e_2(3, a, b, c, d, e, i, w); \ + update_e_2(4, a, b, c, d, e, i, w); \ + update_e_2(5, a, b, c, d, e, i, w); \ + update_e_2(6, a, b, c, d, e, i, w); \ + update_e_2(7, a, b, c, d, e, i, w); \ + update_e_2(8, a, b, c, d, e, i, w); \ + update_e_2(9, a, b, c, d, e, i, w); \ + update_e_2(10, a, b, c, d, e, i, w); \ + update_e_2(11, a, b, c, d, e, i, w); \ + update_e_2(12, a, b, c, d, e, i, w); \ + update_e_2(13, a, b, c, d, e, i, w); \ + update_e_2(14, a, b, c, d, e, i, w); \ + update_e_2(15, a, b, c, d, e, i, w) + +#define UPDATE_E3(a, b, c, d, e, i, w) \ + update_e_3(0, a, b, c, d, e, i, w); \ + update_e_3(1, a, b, c, d, e, i, w); \ + update_e_3(2, a, b, c, d, e, i, w); \ + update_e_3(3, a, b, c, d, e, i, w); \ + update_e_3(4, a, b, c, d, e, i, w); \ + update_e_3(5, a, b, c, d, e, i, w); \ + update_e_3(6, a, b, c, d, e, i, w); \ + update_e_3(7, a, b, c, d, e, i, w); \ + update_e_3(8, a, b, c, d, e, i, w); \ + update_e_3(9, a, b, c, d, e, i, w); \ + update_e_3(10, a, b, c, d, e, i, w); \ + update_e_3(11, a, b, c, d, e, i, w); \ + update_e_3(12, a, b, c, d, e, i, w); \ + update_e_3(13, a, b, c, d, e, i, w); \ + update_e_3(14, a, b, c, d, e, i, w); \ + update_e_3(15, a, b, c, d, e, i, w) + +#define UPDATE_E4(a, b, c, d, e, i, w) \ + update_e_4(0, a, b, c, d, e, i, w); \ + update_e_4(1, a, b, c, d, e, i, w); \ + update_e_4(2, a, b, c, d, e, i, w); \ + update_e_4(3, a, b, c, d, e, i, w); \ + update_e_4(4, a, b, c, d, e, i, w); \ + update_e_4(5, a, b, c, d, e, i, w); \ + update_e_4(6, a, b, c, d, e, i, w); \ + update_e_4(7, a, b, c, d, e, i, w); \ + update_e_4(8, a, b, c, d, e, i, w); \ + update_e_4(9, a, b, c, d, e, i, w); \ + update_e_4(10, a, b, c, d, e, i, w); \ + update_e_4(11, a, b, c, d, e, i, w); \ + update_e_4(12, a, b, c, d, e, i, w); \ + update_e_4(13, a, b, c, d, e, i, w); \ + update_e_4(14, a, b, c, d, e, i, w); \ + update_e_4(15, a, b, c, d, e, i, w) + +#define UPDATE_B(b) \ + update_b(0, b); \ + update_b(1, b); \ + update_b(2, b); \ + update_b(3, b); \ + update_b(4, b); \ + update_b(5, b); \ + update_b(6, b); \ + update_b(7, b); \ + update_b(8, b); \ + update_b(9, b); \ + update_b(10, b); \ + update_b(11, b); \ + update_b(12, b); \ + update_b(13, b); \ + update_b(14, b); \ + update_b(15, b) + +static inline void step00_15(int i, uint32_t * a, uint32_t * b, uint32_t * c, + uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS], + uint32_t * ww) +{ + STORE_W(i, w, ww); + UPDATE_E1(a, b, c, d, e, i, w); + UPDATE_B(b); +} + +static inline void step16_19(int i, uint32_t * a, uint32_t * b, uint32_t * c, + uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS]) +{ + UPDATE_W(i, w); + UPDATE_E1(a, b, c, d, e, i, w); + UPDATE_B(b); + +} + +static inline void step20_39(int i, uint32_t * a, uint32_t * b, uint32_t * 
c, + uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS]) +{ + UPDATE_W(i, w); + UPDATE_E2(a, b, c, d, e, i, w); + UPDATE_B(b); +} + +static inline void step40_59(int i, uint32_t * a, uint32_t * b, uint32_t * c, + uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS]) +{ + UPDATE_W(i, w); + UPDATE_E3(a, b, c, d, e, i, w); + UPDATE_B(b); +} + +static inline void step60_79(int i, uint32_t * a, uint32_t * b, uint32_t * c, + uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS]) +{ + UPDATE_W(i, w); + UPDATE_E4(a, b, c, d, e, i, w); + UPDATE_B(b); +} + +static inline void init_abcde(uint32_t * xx, uint32_t n, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS]) +{ + xx[0] = digests[n][0]; + xx[1] = digests[n][1]; + xx[2] = digests[n][2]; + xx[3] = digests[n][3]; + xx[4] = digests[n][4]; + xx[5] = digests[n][5]; + xx[6] = digests[n][6]; + xx[7] = digests[n][7]; + xx[8] = digests[n][8]; + xx[9] = digests[n][9]; + xx[10] = digests[n][10]; + xx[11] = digests[n][11]; + xx[12] = digests[n][12]; + xx[13] = digests[n][13]; + xx[14] = digests[n][14]; + xx[15] = digests[n][15]; +} + +static inline void add_abcde(uint32_t * xx, uint32_t n, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS]) +{ + digests[n][0] += xx[0]; + digests[n][1] += xx[1]; + digests[n][2] += xx[2]; + digests[n][3] += xx[3]; + digests[n][4] += xx[4]; + digests[n][5] += xx[5]; + digests[n][6] += xx[6]; + digests[n][7] += xx[7]; + digests[n][8] += xx[8]; + digests[n][9] += xx[9]; + digests[n][10] += xx[10]; + digests[n][11] += xx[11]; + digests[n][12] += xx[12]; + digests[n][13] += xx[13]; + digests[n][14] += xx[14]; + digests[n][15] += xx[15]; +} + +/* + * API to perform 0-79 steps of the multi-hash algorithm for + * a single block of data. The caller is responsible for ensuring + * a full block of data input. + * + * Argument: + * input - the pointer to the data + * digest - the space to hold the digests for all segments. 
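+ *   frame_buffer - temporary working buffer that holds the big-endian
+ *                  message schedule for all segments.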
+ * + * Return: + * N/A + */ +void mh_sha1_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS], + uint8_t * frame_buffer) +{ + uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS], ee[HASH_SEGS]; + uint32_t *ww = (uint32_t *) input; + uint32_t(*w)[HASH_SEGS]; + + w = (uint32_t(*)[HASH_SEGS]) frame_buffer; + + init_abcde(aa, 0, digests); + init_abcde(bb, 1, digests); + init_abcde(cc, 2, digests); + init_abcde(dd, 3, digests); + init_abcde(ee, 4, digests); + + step00_15(0, aa, bb, cc, dd, ee, w, ww); + step00_15(1, ee, aa, bb, cc, dd, w, ww); + step00_15(2, dd, ee, aa, bb, cc, w, ww); + step00_15(3, cc, dd, ee, aa, bb, w, ww); + step00_15(4, bb, cc, dd, ee, aa, w, ww); + step00_15(5, aa, bb, cc, dd, ee, w, ww); + step00_15(6, ee, aa, bb, cc, dd, w, ww); + step00_15(7, dd, ee, aa, bb, cc, w, ww); + step00_15(8, cc, dd, ee, aa, bb, w, ww); + step00_15(9, bb, cc, dd, ee, aa, w, ww); + step00_15(10, aa, bb, cc, dd, ee, w, ww); + step00_15(11, ee, aa, bb, cc, dd, w, ww); + step00_15(12, dd, ee, aa, bb, cc, w, ww); + step00_15(13, cc, dd, ee, aa, bb, w, ww); + step00_15(14, bb, cc, dd, ee, aa, w, ww); + step00_15(15, aa, bb, cc, dd, ee, w, ww); + + step16_19(16, ee, aa, bb, cc, dd, w); + step16_19(17, dd, ee, aa, bb, cc, w); + step16_19(18, cc, dd, ee, aa, bb, w); + step16_19(19, bb, cc, dd, ee, aa, w); + + step20_39(20, aa, bb, cc, dd, ee, w); + step20_39(21, ee, aa, bb, cc, dd, w); + step20_39(22, dd, ee, aa, bb, cc, w); + step20_39(23, cc, dd, ee, aa, bb, w); + step20_39(24, bb, cc, dd, ee, aa, w); + step20_39(25, aa, bb, cc, dd, ee, w); + step20_39(26, ee, aa, bb, cc, dd, w); + step20_39(27, dd, ee, aa, bb, cc, w); + step20_39(28, cc, dd, ee, aa, bb, w); + step20_39(29, bb, cc, dd, ee, aa, w); + step20_39(30, aa, bb, cc, dd, ee, w); + step20_39(31, ee, aa, bb, cc, dd, w); + step20_39(32, dd, ee, aa, bb, cc, w); + step20_39(33, cc, dd, ee, aa, bb, w); + step20_39(34, bb, cc, dd, ee, aa, w); + step20_39(35, aa, bb, cc, dd, ee, w); + step20_39(36, ee, aa, bb, cc, dd, w); + step20_39(37, dd, ee, aa, bb, cc, w); + step20_39(38, cc, dd, ee, aa, bb, w); + step20_39(39, bb, cc, dd, ee, aa, w); + + step40_59(40, aa, bb, cc, dd, ee, w); + step40_59(41, ee, aa, bb, cc, dd, w); + step40_59(42, dd, ee, aa, bb, cc, w); + step40_59(43, cc, dd, ee, aa, bb, w); + step40_59(44, bb, cc, dd, ee, aa, w); + step40_59(45, aa, bb, cc, dd, ee, w); + step40_59(46, ee, aa, bb, cc, dd, w); + step40_59(47, dd, ee, aa, bb, cc, w); + step40_59(48, cc, dd, ee, aa, bb, w); + step40_59(49, bb, cc, dd, ee, aa, w); + step40_59(50, aa, bb, cc, dd, ee, w); + step40_59(51, ee, aa, bb, cc, dd, w); + step40_59(52, dd, ee, aa, bb, cc, w); + step40_59(53, cc, dd, ee, aa, bb, w); + step40_59(54, bb, cc, dd, ee, aa, w); + step40_59(55, aa, bb, cc, dd, ee, w); + step40_59(56, ee, aa, bb, cc, dd, w); + step40_59(57, dd, ee, aa, bb, cc, w); + step40_59(58, cc, dd, ee, aa, bb, w); + step40_59(59, bb, cc, dd, ee, aa, w); + + step60_79(60, aa, bb, cc, dd, ee, w); + step60_79(61, ee, aa, bb, cc, dd, w); + step60_79(62, dd, ee, aa, bb, cc, w); + step60_79(63, cc, dd, ee, aa, bb, w); + step60_79(64, bb, cc, dd, ee, aa, w); + step60_79(65, aa, bb, cc, dd, ee, w); + step60_79(66, ee, aa, bb, cc, dd, w); + step60_79(67, dd, ee, aa, bb, cc, w); + step60_79(68, cc, dd, ee, aa, bb, w); + step60_79(69, bb, cc, dd, ee, aa, w); + step60_79(70, aa, bb, cc, dd, ee, w); + step60_79(71, ee, aa, bb, cc, dd, w); + step60_79(72, dd, ee, aa, bb, cc, w); + step60_79(73, cc, dd, ee, aa, bb, w); + step60_79(74, bb, cc, dd, ee, aa, w); + 
step60_79(75, aa, bb, cc, dd, ee, w); + step60_79(76, ee, aa, bb, cc, dd, w); + step60_79(77, dd, ee, aa, bb, cc, w); + step60_79(78, cc, dd, ee, aa, bb, w); + step60_79(79, bb, cc, dd, ee, aa, w); + + add_abcde(aa, 0, digests); + add_abcde(bb, 1, digests); + add_abcde(cc, 2, digests); + add_abcde(dd, 3, digests); + add_abcde(ee, 4, digests); +} + +void mh_sha1_block_base(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks) +{ + uint32_t i; + + for (i = 0; i < num_blocks; i++) { + mh_sha1_single(input_data, digests, frame_buffer); + input_data += MH_SHA1_BLOCK_SIZE; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm new file mode 100644 index 000000000..3d75d1649 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm @@ -0,0 +1,498 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using SSE +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + paddd %%regE,[%%data + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + 
pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define MOVPS movups + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep | + +align 32 + +;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. 
+; arg 3 number of 1KB blocks +; +mk_global mh_sha1_block_sse, function, internal +func(mh_sha1_block_sse) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by sse + and rsp, ~0x0F + + %assign I 0 ; copy segs_digests into stack + %rep 5 + MOVPS A, [mh_digests_p + I*64 + 16*0] + MOVPS B, [mh_digests_p + I*64 + 16*1] + MOVPS C, [mh_digests_p + I*64 + 16*2] + MOVPS D, [mh_digests_p + I*64 + 16*3] + + movdqa [rsp + I*64 + 16*0], A + movdqa [rsp + I*64 + 16*1], B + movdqa [rsp + I*64 + 16*2], C + movdqa [rsp + I*64 + 16*3], D + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + movdqa F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4 + %assign I 0 + %rep 16 + MOVPS T0,[mh_in_p + I*64+0*16] + MOVPS T1,[mh_in_p + I*64+1*16] + MOVPS T2,[mh_in_p + I*64+2*16] + MOVPS T3,[mh_in_p + I*64+3*16] + + pshufb T0, F + movdqa [mh_data_p +(I)*16 +0*256],T0 + pshufb T1, F + movdqa [mh_data_p +(I)*16 +1*256],T1 + pshufb T2, F + movdqa [mh_data_p +(I)*16 +2*256],T2 + pshufb T3, F + movdqa [mh_data_p +(I)*16 +3*256],T3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + ;; Initialize digests + movdqa A, [rsp + 0*64 + mh_segs] + movdqa B, [rsp + 1*64 + mh_segs] + movdqa C, [rsp + 2*64 + mh_segs] + movdqa D, [rsp + 3*64 + mh_segs] + movdqa E, [rsp + 4*64 + mh_segs] + + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E +;; +;; perform 0-79 steps +;; + movdqa K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 16...19 + movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16] + movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16] + %rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] +;; do rounds 20...39 + movdqa K, [K20_39] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 40...59 + movdqa K, [K40_59] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*1] +;; do rounds 60...79 + movdqa K, [K60_79] + %rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + paddd A, AA + paddd B, BB + paddd C, CC + paddd D, DD + paddd E, EE + + ; write out digests + movdqa [rsp + 0*64 + mh_segs], A + movdqa [rsp + 1*64 + mh_segs], B + movdqa [rsp + 2*64 + mh_segs], C + movdqa [rsp + 3*64 + mh_segs], D + movdqa [rsp + 4*64 + mh_segs], E + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 5 + movdqa A, [rsp + I*64 + 16*0] + movdqa B, [rsp + I*64 + 16*1] + movdqa C, [rsp + I*64 + 16*2] + movdqa D, [rsp + I*64 + 16*3] + + MOVPS [mh_digests_p + I*64 + 16*0], A + MOVPS [mh_digests_p + I*64 + 16*1], B + MOVPS [mh_digests_p + I*64 + 16*2], C + MOVPS [mh_digests_p + I*64 + 16*3], D + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + 
+endproc_frame + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c new file mode 100644 index 000000000..3058aaa87 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c @@ -0,0 +1,122 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * mh_sha1_finalize_base.c contains the prototypes of mh_sha1_finalize_XXX + * and mh_sha1_tail_XXX. Default definitions are base type which generates + * mh_sha1_finalize_base and mh_sha1_tail_base. Other types are generated + * through different predefined macros by mh_sha1.c. + * mh_sha1_tail is used to calculate the last incomplete block of input + * data. mh_sha1_finalize is the mh_sha1_ctx wrapper of mh_sha1_tail. 
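+ * mh_sha1_tail pads the remaining data with the standard SHA-1 scheme
+ * (a 0x80 byte, zero fill, then the 64-bit big-endian total bit length),
+ * processing one extra block when the length field does not fit.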
+ */ +#ifndef MH_SHA1_FINALIZE_FUNCTION +#include +#include "mh_sha1_internal.h" + +#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_base +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base +#define MH_SHA1_FINALIZE_SLVER +#endif + +void MH_SHA1_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer, + uint32_t digests[SHA1_DIGEST_WORDS]) +{ + uint64_t partial_buffer_len, len_in_bit; + + partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE; + + // Padding the first block + partial_buffer[partial_buffer_len] = 0x80; + partial_buffer_len++; + memset(partial_buffer + partial_buffer_len, 0, + MH_SHA1_BLOCK_SIZE - partial_buffer_len); + + // Calculate the first block without total_length if padding needs 2 block + if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) { + MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1); + //Padding the second block + memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE); + } + //Padding the block + len_in_bit = to_be64((uint64_t) total_len * 8); + *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit; + MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1); + + //Calculate multi-hash SHA1 digests (segment digests as input message) + sha1_for_mh_sha1((uint8_t *) mh_sha1_segs_digests, digests, + 4 * SHA1_DIGEST_WORDS * HASH_SEGS); + + return; +} + +int MH_SHA1_FINALIZE_FUNCTION(struct mh_sha1_ctx *ctx, void *mh_sha1_digest) +{ + uint8_t *partial_block_buffer; + uint64_t total_len; + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + + if (ctx == NULL) + return MH_SHA1_CTX_ERROR_NULL; + + total_len = ctx->total_length; + partial_block_buffer = ctx->partial_block_buffer; + + /* mh_sha1 tail */ + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; + + MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests, + aligned_frame_buffer, ctx->mh_sha1_digest); + + /* Output the digests of mh_sha1 */ + if (mh_sha1_digest != NULL) { + ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0]; + ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1]; + ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2]; + ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3]; + ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4]; + } + + return MH_SHA1_CTX_ERROR_NONE; +} + +#ifdef MH_SHA1_FINALIZE_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver mh_sha1_finalize_base_slver_0000027b; +struct slver mh_sha1_finalize_base_slver = { 0x027b, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h new file mode 100644 index 000000000..81823048e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h @@ -0,0 +1,308 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef _MH_SHA1_INTERNAL_H_ +#define _MH_SHA1_INTERNAL_H_ + +/** + * @file mh_sha1_internal.h + * @brief mh_sha1 internal function prototypes and macros + * + * Interface for mh_sha1 internal functions + * + */ +#include +#include "mh_sha1.h" +#include "endian_helper.h" + +#ifdef __cplusplus + extern "C" { +#endif + +#ifdef _MSC_VER +# define inline __inline +#endif + + // 64byte pointer align +#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) ) + + /******************************************************************* + *mh_sha1 constants and macros + ******************************************************************/ + /* mh_sha1 constants */ +#define MH_SHA1_H0 0x67452301UL +#define MH_SHA1_H1 0xefcdab89UL +#define MH_SHA1_H2 0x98badcfeUL +#define MH_SHA1_H3 0x10325476UL +#define MH_SHA1_H4 0xc3d2e1f0UL + +#define K_00_19 0x5a827999UL +#define K_20_39 0x6ed9eba1UL +#define K_40_59 0x8f1bbcdcUL +#define K_60_79 0xca62c1d6UL + + /* mh_sha1 macros */ +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (b ^ c ^ d) +#define F3(b,c,d) ((b & c) | (d & (b | c))) +#define F4(b,c,d) (b ^ c ^ d) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + + /******************************************************************* + * SHA1 API internal function prototypes + ******************************************************************/ + + /** + * @brief Performs complete SHA1 algorithm. + * + * @param input Pointer to buffer containing the input message. + * @param digest Pointer to digest to update. + * @param len Length of buffer. + * @returns None + */ + void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len); + + /******************************************************************* + * mh_sha1 API internal function prototypes + * Multiple versions of Update and Finalize functions are supplied which use + * multiple versions of block and tail process subfunctions. + ******************************************************************/ + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. 
+ * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @returns none + * + */ + void mh_sha1_tail(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha1_digest mh_sha1 digest + * @returns none + * + */ + void mh_sha1_tail_base(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. + * + * @requires SSE + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha1_digest mh_sha1 digest + * @returns none + * + */ + void mh_sha1_tail_sse(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. + * + * @requires AVX + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha1_digest mh_sha1 digest + * @returns none + * + */ + void mh_sha1_tail_avx(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. + * + * @requires AVX2 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . 
+ * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha1_digest mh_sha1 digest + * @returns none + * + */ + void mh_sha1_tail_avx2(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha1. + * + * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE. + * It will output the final SHA1 digest based on mh_sha1_segs_digests. + * + * @requires AVX512 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha1_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha1_digest mh_sha1 digest + * @returns none + * + */ + void mh_sha1_tail_avx512(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block_base(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. + * + * @requires SSE + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. + * + * @requires AVX + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. 
+ * + * @requires AVX2 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha1 digest of blocks which size is MH_SHA1_BLOCK_SIZE*N. + * + * @requires AVX512 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm new file mode 100644 index 000000000..590aa6c5f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm @@ -0,0 +1,77 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%include "reg_sizes.asm" +%include "multibinary.asm" + +%ifidn __OUTPUT_FORMAT__, elf32 + [bits 32] +%else + default rel + [bits 64] + + extern mh_sha1_update_sse + extern mh_sha1_update_avx + extern mh_sha1_update_avx2 + extern mh_sha1_finalize_sse + extern mh_sha1_finalize_avx + extern mh_sha1_finalize_avx2 + + %ifdef HAVE_AS_KNOWS_AVX512 + extern mh_sha1_update_avx512 + extern mh_sha1_finalize_avx512 + %endif + +%endif + +extern mh_sha1_update_base +extern mh_sha1_finalize_base + +mbin_interface mh_sha1_update +mbin_interface mh_sha1_finalize + +%ifidn __OUTPUT_FORMAT__, elf64 + + %ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2, mh_sha1_update_avx512 + mbin_dispatch_init6 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2, mh_sha1_finalize_avx512 + %else + mbin_dispatch_init5 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2 + mbin_dispatch_init5 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2 + %endif + +%else + mbin_dispatch_init2 mh_sha1_update, mh_sha1_update_base + mbin_dispatch_init2 mh_sha1_finalize, mh_sha1_finalize_base +%endif + +;;; func core, ver, snum +slversion mh_sha1_update, 00, 02, 0272 +slversion mh_sha1_finalize, 00, 02, 0273 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c new file mode 100644 index 000000000..4fd6c09a1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c @@ -0,0 +1,180 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "mh_sha1.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Loop many times over same +# define TEST_LEN 16*1024 +# define TEST_LOOPS 20000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define TEST_LEN 32*1024*1024 +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#define TEST_MEM TEST_LEN + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_CTX_ERROR_NONE){ \ + printf("The mh_sha1 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_base[i]) + mh_sha1_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("base: "); + dump((char *)hash_base, 20); + printf("ref: "); + dump((char *)hash_test, 20); + } + + return mh_sha1_fail; +} + +int main(int argc, char *argv[]) +{ + int i, fail = 0; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS]; + uint8_t *buff = NULL; + struct mh_sha1_ctx *update_ctx_test = NULL, *update_ctx_base = NULL; + struct perf start, stop; + + printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n"); + + buff = malloc(TEST_LEN); + update_ctx_test = malloc(sizeof(*update_ctx_test)); + update_ctx_base = malloc(sizeof(*update_ctx_base)); + + if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + // mh_sha1 base version + mh_sha1_init(update_ctx_base); + mh_sha1_update_base(update_ctx_base, buff, TEST_LEN); + mh_sha1_finalize_base(update_ctx_base, hash_base); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS / 10; i++) { + mh_sha1_init(update_ctx_base); + mh_sha1_update_base(update_ctx_base, buff, TEST_LEN); + mh_sha1_finalize_base(update_ctx_base, hash_base); + } + perf_stop(&stop); + printf("mh_sha1_update_base" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + //Update feature test + CHECK_RETURN(mh_sha1_init(update_ctx_test)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test)); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + CHECK_RETURN(mh_sha1_init(update_ctx_test)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test)); + } + perf_stop(&stop); + printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + // 
Check results + fail = compare_digests(hash_base, hash_test); + + if (fail) { + printf("Fail size=%d\n", TEST_LEN); + return -1; + } + + if (fail) + printf("Test failed function test%d\n", fail); + else + printf("Pass func check\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c new file mode 100644 index 000000000..71caba50e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c @@ -0,0 +1,430 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "mh_sha1_internal.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + // Macros and sub-functions which already exist in source code file + // (sha1_for_mh_sha1.c) is part of ISA-L library as internal functions. + // The reason why writing them twice is the linking issue caused by + // mh_sha1_ref(). mh_sha1_ref() needs these macros and sub-functions + // without linking ISA-L library. So mh_sha1_ref() includes them in + // order to contain essential sub-functions in its own object file. 
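/*
 * Editorial sketch (not part of the upstream file): a reminder of the
 * interleaved layout this reference code relies on.  Each
 * MH_SHA1_BLOCK_SIZE block is treated as 16 rows of 16 32-bit words;
 * segment `seg` is fed the `seg`-th word of every row, exactly as
 * transform_input_single() gathers them further down in this file.  The
 * helper below only illustrates the byte offset, inside one block, of
 * message word `w` of segment `seg`.
 */
static inline uint32_t mh_sha1_seg_word_offset(uint32_t seg, uint32_t w)
{
	/* dword index 16*w + seg, i.e. byte offset 64*w + 4*seg */
	return 64 * w + 4 * seg;
}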
+//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define W(x) w[(x) & 15] + +#define step00_19(i,a,b,c,d,e) \ + if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + else W(i) = to_be32(ww[i]); \ + e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \ + b = rol32(b,30) + +#define step20_39(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \ + b = rol32(b,30) + +#define step40_59(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \ + b = rol32(b,30) + +#define step60_79(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \ + b = rol32(b,30) + +static void OPT_FIX sha1_single_for_mh_sha1_ref(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e; + uint32_t w[16] = { 0 }; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + + step00_19(0, a, b, c, d, e); + step00_19(1, e, a, b, c, d); + step00_19(2, d, e, a, b, c); + step00_19(3, c, d, e, a, b); + step00_19(4, b, c, d, e, a); + step00_19(5, a, b, c, d, e); + step00_19(6, e, a, b, c, d); + step00_19(7, d, e, a, b, c); + step00_19(8, c, d, e, a, b); + step00_19(9, b, c, d, e, a); + step00_19(10, a, b, c, d, e); + step00_19(11, e, a, b, c, d); + step00_19(12, d, e, a, b, c); + step00_19(13, c, d, e, a, b); + step00_19(14, b, c, d, e, a); + step00_19(15, a, b, c, d, e); + step00_19(16, e, a, b, c, d); + step00_19(17, d, e, a, b, c); + step00_19(18, c, d, e, a, b); + step00_19(19, b, c, d, e, a); + + step20_39(20, a, b, c, d, e); + step20_39(21, e, a, b, c, d); + step20_39(22, d, e, a, b, c); + step20_39(23, c, d, e, a, b); + step20_39(24, b, c, d, e, a); + step20_39(25, a, b, c, d, e); + step20_39(26, e, a, b, c, d); + step20_39(27, d, e, a, b, c); + step20_39(28, c, d, e, a, b); + step20_39(29, b, c, d, e, a); + step20_39(30, a, b, c, d, e); + step20_39(31, e, a, b, c, d); + step20_39(32, d, e, a, b, c); + step20_39(33, c, d, e, a, b); + step20_39(34, b, c, d, e, a); + step20_39(35, a, b, c, d, e); + step20_39(36, e, a, b, c, d); + step20_39(37, d, e, a, b, c); + step20_39(38, c, d, e, a, b); + step20_39(39, b, c, d, e, a); + + step40_59(40, a, b, c, d, e); + step40_59(41, e, a, b, c, d); + step40_59(42, d, e, a, b, c); + step40_59(43, c, d, e, a, b); + step40_59(44, b, c, d, e, a); + step40_59(45, a, b, c, d, e); + step40_59(46, e, a, b, c, d); + step40_59(47, d, e, a, b, c); + step40_59(48, c, d, e, a, b); + step40_59(49, b, c, d, e, a); + step40_59(50, a, b, c, d, e); + step40_59(51, e, a, b, c, d); + step40_59(52, d, e, a, b, c); + step40_59(53, c, d, e, a, b); + step40_59(54, b, c, d, e, a); + step40_59(55, a, b, c, d, e); + step40_59(56, e, a, b, c, d); + step40_59(57, d, e, a, b, c); + step40_59(58, c, d, e, a, b); + step40_59(59, b, c, d, e, a); + + step60_79(60, a, b, c, d, e); + step60_79(61, e, a, b, c, d); + step60_79(62, d, e, a, b, c); + step60_79(63, c, d, e, a, b); + step60_79(64, b, c, d, e, a); + step60_79(65, a, b, c, d, e); + step60_79(66, e, a, b, c, d); + step60_79(67, d, e, a, b, c); + step60_79(68, c, d, e, a, b); + step60_79(69, b, c, d, e, a); + step60_79(70, a, b, c, d, e); + step60_79(71, e, a, b, c, d); + step60_79(72, d, e, a, b, 
c); + step60_79(73, c, d, e, a, b); + step60_79(74, b, c, d, e, a); + step60_79(75, a, b, c, d, e); + step60_79(76, e, a, b, c, d); + step60_79(77, d, e, a, b, c); + step60_79(78, c, d, e, a, b); + step60_79(79, b, c, d, e, a); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; +} + +void sha1_for_mh_sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA1_BLOCK_SIZE]; + + digest[0] = MH_SHA1_H0; + digest[1] = MH_SHA1_H1; + digest[2] = MH_SHA1_H2; + digest[3] = MH_SHA1_H3; + digest[4] = MH_SHA1_H4; + + i = len; + while (i >= SHA1_BLOCK_SIZE) { + sha1_single_for_mh_sha1_ref(input_data, digest); + input_data += SHA1_BLOCK_SIZE; + i -= SHA1_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++) + buf[j] = 0; + + if (i > SHA1_BLOCK_SIZE - 8) + i = 2 * SHA1_BLOCK_SIZE; + else + i = SHA1_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha1_single_for_mh_sha1_ref(buf, digest); + if (i == (2 * SHA1_BLOCK_SIZE)) + sha1_single_for_mh_sha1_ref(buf + SHA1_BLOCK_SIZE, digest); +} + +/* + * buffer to rearrange one segment data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * w0 | w1 | ... | w15 + * + */ +static inline void transform_input_single(uint32_t * new_data, uint32_t * input, + uint32_t segment) +{ + new_data[16 * segment + 0] = input[16 * 0 + segment]; + new_data[16 * segment + 1] = input[16 * 1 + segment]; + new_data[16 * segment + 2] = input[16 * 2 + segment]; + new_data[16 * segment + 3] = input[16 * 3 + segment]; + new_data[16 * segment + 4] = input[16 * 4 + segment]; + new_data[16 * segment + 5] = input[16 * 5 + segment]; + new_data[16 * segment + 6] = input[16 * 6 + segment]; + new_data[16 * segment + 7] = input[16 * 7 + segment]; + new_data[16 * segment + 8] = input[16 * 8 + segment]; + new_data[16 * segment + 9] = input[16 * 9 + segment]; + new_data[16 * segment + 10] = input[16 * 10 + segment]; + new_data[16 * segment + 11] = input[16 * 11 + segment]; + new_data[16 * segment + 12] = input[16 * 12 + segment]; + new_data[16 * segment + 13] = input[16 * 13 + segment]; + new_data[16 * segment + 14] = input[16 * 14 + segment]; + new_data[16 * segment + 15] = input[16 * 15 + segment]; +} + +// Adapt parameters to sha1_single_for_mh_sha1_ref +#define sha1_update_one_seg(data, digest) \ + sha1_single_for_mh_sha1_ref((const uint8_t *)(data), (uint32_t *)(digest)) + +/* + * buffer to Rearrange all segments data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * seg0: | w0 | w1 | ... | w15 + * seg1: | w0 | w1 | ... | w15 + * seg2: | w0 | w1 | ... | w15 + * .... + * seg15: | w0 | w1 | ... 
| w15 + * + */ +static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block) +{ + uint32_t *current_input = input + block * MH_SHA1_BLOCK_SIZE / 4; + + transform_input_single(new_data, current_input, 0); + transform_input_single(new_data, current_input, 1); + transform_input_single(new_data, current_input, 2); + transform_input_single(new_data, current_input, 3); + transform_input_single(new_data, current_input, 4); + transform_input_single(new_data, current_input, 5); + transform_input_single(new_data, current_input, 6); + transform_input_single(new_data, current_input, 7); + transform_input_single(new_data, current_input, 8); + transform_input_single(new_data, current_input, 9); + transform_input_single(new_data, current_input, 10); + transform_input_single(new_data, current_input, 11); + transform_input_single(new_data, current_input, 12); + transform_input_single(new_data, current_input, 13); + transform_input_single(new_data, current_input, 14); + transform_input_single(new_data, current_input, 15); + +} + +/* + * buffer to Calculate all segments' digests from one block. + * + * Layout of seg_digest: + * segment + * ------------------------- + * seg0: | H0 | H1 | ... | H4 + * seg1: | H0 | H1 | ... | H4 + * seg2: | H0 | H1 | ... | H4 + * .... + * seg15: | H0 | H1 | ... | H4 + * + */ +static inline void sha1_update_all_segs(uint32_t * new_data, + uint32_t(*mh_sha1_seg_digests)[SHA1_DIGEST_WORDS]) +{ + sha1_update_one_seg(&(new_data)[16 * 0], mh_sha1_seg_digests[0]); + sha1_update_one_seg(&(new_data)[16 * 1], mh_sha1_seg_digests[1]); + sha1_update_one_seg(&(new_data)[16 * 2], mh_sha1_seg_digests[2]); + sha1_update_one_seg(&(new_data)[16 * 3], mh_sha1_seg_digests[3]); + sha1_update_one_seg(&(new_data)[16 * 4], mh_sha1_seg_digests[4]); + sha1_update_one_seg(&(new_data)[16 * 5], mh_sha1_seg_digests[5]); + sha1_update_one_seg(&(new_data)[16 * 6], mh_sha1_seg_digests[6]); + sha1_update_one_seg(&(new_data)[16 * 7], mh_sha1_seg_digests[7]); + sha1_update_one_seg(&(new_data)[16 * 8], mh_sha1_seg_digests[8]); + sha1_update_one_seg(&(new_data)[16 * 9], mh_sha1_seg_digests[9]); + sha1_update_one_seg(&(new_data)[16 * 10], mh_sha1_seg_digests[10]); + sha1_update_one_seg(&(new_data)[16 * 11], mh_sha1_seg_digests[11]); + sha1_update_one_seg(&(new_data)[16 * 12], mh_sha1_seg_digests[12]); + sha1_update_one_seg(&(new_data)[16 * 13], mh_sha1_seg_digests[13]); + sha1_update_one_seg(&(new_data)[16 * 14], mh_sha1_seg_digests[14]); + sha1_update_one_seg(&(new_data)[16 * 15], mh_sha1_seg_digests[15]); +} + +void mh_sha1_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks) +{ + uint32_t i, j; + uint32_t *temp_buffer = (uint32_t *) frame_buffer; + uint32_t(*trans_digests)[SHA1_DIGEST_WORDS]; + + trans_digests = (uint32_t(*)[SHA1_DIGEST_WORDS]) digests; + + // Re-structure seg_digests from 5*16 to 16*5 + for (j = 0; j < HASH_SEGS; j++) { + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + temp_buffer[j * SHA1_DIGEST_WORDS + i] = digests[i][j]; + } + } + memcpy(trans_digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS); + + // Calculate digests for all segments, leveraging sha1 API + for (i = 0; i < num_blocks; i++) { + transform_input(temp_buffer, (uint32_t *) input_data, i); + sha1_update_all_segs(temp_buffer, trans_digests); + } + + // Re-structure seg_digests from 16*5 to 5*16 + for (j = 0; j < HASH_SEGS; j++) { + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + temp_buffer[i * HASH_SEGS + j] = 
trans_digests[j][i]; + } + } + memcpy(digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS); + + return; +} + +void mh_sha1_tail_ref(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer, + uint32_t digests[SHA1_DIGEST_WORDS]) +{ + uint64_t partial_buffer_len, len_in_bit; + + partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE; + + // Padding the first block + partial_buffer[partial_buffer_len] = 0x80; + partial_buffer_len++; + memset(partial_buffer + partial_buffer_len, 0, + MH_SHA1_BLOCK_SIZE - partial_buffer_len); + + // Calculate the first block without total_length if padding needs 2 block + if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) { + mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1); + //Padding the second block + memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE); + } + //Padding the block + len_in_bit = to_be64((uint64_t) total_len * 8); + *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit; + mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1); + + //Calculate multi-hash SHA1 digests (segment digests as input message) + sha1_for_mh_sha1_ref((uint8_t *) mh_sha1_segs_digests, digests, + 4 * SHA1_DIGEST_WORDS * HASH_SEGS); + + return; +} + +void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest) +{ + uint64_t total_len; + uint64_t num_blocks; + uint32_t mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS]; + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE]; + uint8_t partial_block_buffer[MH_SHA1_BLOCK_SIZE * 2]; + uint32_t mh_sha1_hash_dword[SHA1_DIGEST_WORDS]; + uint32_t i; + const uint8_t *input_data = (const uint8_t *)buffer; + + /* Initialize digests of all segments */ + for (i = 0; i < HASH_SEGS; i++) { + mh_sha1_segs_digests[0][i] = MH_SHA1_H0; + mh_sha1_segs_digests[1][i] = MH_SHA1_H1; + mh_sha1_segs_digests[2][i] = MH_SHA1_H2; + mh_sha1_segs_digests[3][i] = MH_SHA1_H3; + mh_sha1_segs_digests[4][i] = MH_SHA1_H4; + } + + total_len = len; + + // Calculate blocks + num_blocks = len / MH_SHA1_BLOCK_SIZE; + if (num_blocks > 0) { + //do num_blocks process + mh_sha1_block_ref(input_data, mh_sha1_segs_digests, frame_buffer, num_blocks); + len -= num_blocks * MH_SHA1_BLOCK_SIZE; + input_data += num_blocks * MH_SHA1_BLOCK_SIZE; + } + // Store the partial block + if (len != 0) { + memcpy(partial_block_buffer, input_data, len); + } + + /* Finalize */ + mh_sha1_tail_ref(partial_block_buffer, total_len, mh_sha1_segs_digests, + frame_buffer, mh_sha1_hash_dword); + + // Output the digests of mh_sha1 + if (mh_sha1_digest != NULL) { + mh_sha1_digest[0] = mh_sha1_hash_dword[0]; + mh_sha1_digest[1] = mh_sha1_hash_dword[1]; + mh_sha1_digest[2] = mh_sha1_hash_dword[2]; + mh_sha1_digest[3] = mh_sha1_hash_dword[3]; + mh_sha1_digest[4] = mh_sha1_hash_dword[4]; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c new file mode 100644 index 000000000..792c4452b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c @@ -0,0 +1,217 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "mh_sha1.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_CTX_ERROR_NONE){ \ + printf("The mh_sha1 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest); +#define MH_SHA1_REF mh_sha1_ref + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_ref[i]) + mh_sha1_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("ref: "); + dump((char *)hash_ref, 20); + printf("test: "); + dump((char *)hash_test, 20); + } + + return mh_sha1_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS]; + uint8_t *buff = NULL; + int size, offset; + struct mh_sha1_ctx *update_ctx = NULL; + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + MH_SHA1_REF(buff, TEST_LEN, hash_ref); + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + 
CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages + for (size = TEST_LEN; size >= 0; size--) { + + // Fill with rand data + rand_buffer(buff, size); + + MH_SHA1_REF(buff, size, hash_ref); + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various buffer offsets and sizes + printf("offset tests"); + for (size = TEST_LEN - 256; size > 256; size -= 11) { + for (offset = 0; offset < 256; offset++) { + MH_SHA1_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + } + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + // Run efence tests + printf("efence tests"); + for (size = TEST_SIZE; size > 0; size--) { + offset = TEST_LEN - size; + + MH_SHA1_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:"); + printf(" %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c new file mode 100644 index 000000000..4af220299 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c @@ -0,0 +1,110 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * mh_sha1_update_base.c contains the prototype of mh_sha1_update_XXX. + * Default definitions are base type which generates mh_sha1_update_base. + * Other types are generated through different predefined macros by mh_sha1.c. + */ +#ifndef MH_SHA1_UPDATE_FUNCTION +#include "mh_sha1_internal.h" +#include + +#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_base +#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base +#define MH_SHA1_UPDATE_SLVER +#endif + +int MH_SHA1_UPDATE_FUNCTION(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len) +{ + + uint8_t *partial_block_buffer; + uint64_t partial_block_len; + uint64_t num_blocks; + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + const uint8_t *input_data = (const uint8_t *)buffer; + + if (ctx == NULL) + return MH_SHA1_CTX_ERROR_NULL; + + if (len == 0) + return MH_SHA1_CTX_ERROR_NONE; + + partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE; + partial_block_buffer = ctx->partial_block_buffer; + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; + + ctx->total_length += len; + // No enough input data for mh_sha1 calculation + if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) { + memcpy(partial_block_buffer + partial_block_len, input_data, len); + return MH_SHA1_CTX_ERROR_NONE; + } + // mh_sha1 calculation for the previous partial block + if (partial_block_len != 0) { + memcpy(partial_block_buffer + partial_block_len, input_data, + MH_SHA1_BLOCK_SIZE - partial_block_len); + //do one_block process + MH_SHA1_BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests, + aligned_frame_buffer, 1); + input_data += MH_SHA1_BLOCK_SIZE - partial_block_len; + len -= MH_SHA1_BLOCK_SIZE - partial_block_len; + memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE); + } + // Calculate mh_sha1 for the current blocks + num_blocks = len / MH_SHA1_BLOCK_SIZE; + if (num_blocks > 0) { + //do num_blocks process + MH_SHA1_BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer, + num_blocks); + len -= num_blocks * MH_SHA1_BLOCK_SIZE; + input_data += num_blocks * MH_SHA1_BLOCK_SIZE; + } + // Store the partial block + if (len != 0) { + memcpy(partial_block_buffer, input_data, len); + } + + return MH_SHA1_CTX_ERROR_NONE; + +} + +#ifdef MH_SHA1_UPDATE_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + + // Version info +struct slver mh_sha1_update_base_slver_0000027a; +struct slver mh_sha1_update_base_slver = { 0x027a, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c new file mode 100644 index 000000000..942dfd09f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c @@ -0,0 +1,240 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel 
Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "mh_sha1.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_CTX_ERROR_NONE){ \ + printf("The mh_sha1 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest); + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_ref[i]) + mh_sha1_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("ref: "); + dump((char *)hash_ref, 20); + printf("test: "); + dump((char *)hash_test, 20); + } + + return mh_sha1_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0, i; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS]; + uint8_t *buff = NULL; + int update_count; + int size1, size2, offset, addr_offset; + struct mh_sha1_ctx *update_ctx = NULL; + uint8_t *mem_addr = NULL; + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = 
malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + mh_sha1_ref(buff, TEST_LEN, hash_ref); + + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages by update twice. + printf("\n various size messages by update twice tests"); + for (size1 = TEST_LEN; size1 >= 0; size1--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size2 = TEST_LEN - size1; // size2 is different with the former + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various update count + printf("\n various update count tests"); + for (update_count = 1; update_count <= TEST_LEN; update_count++) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size1 = TEST_LEN / update_count; + size2 = TEST_LEN - size1 * (update_count - 1); // size2 is different with the former + + CHECK_RETURN(mh_sha1_init(update_ctx)); + for (i = 1, offset = 0; i < update_count; i++) { + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1)); + offset += size1; + } + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // test various start address of ctx. + printf("\n various start address of ctx test"); + free(update_ctx); + mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10); + for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_ref(buff, TEST_LEN, hash_ref); + + // a unaligned offset + update_ctx = (struct mh_sha1_ctx *)(mem_addr + addr_offset); + CHECK_RETURN(mh_sha1_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail addr_offset=%d\n", addr_offset); + return -1; + } + + if ((addr_offset & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c new file mode 100644 index 000000000..224977e6c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c @@ -0,0 +1,204 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "mh_sha1_internal.h" +#include + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA1 Functions for mh_sha1 +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define W(x) w[(x) & 15] + +#define step00_19(i,a,b,c,d,e) \ + if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + else W(i) = to_be32(ww[i]); \ + e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \ + b = rol32(b,30) + +#define step20_39(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \ + b = rol32(b,30) + +#define step40_59(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \ + b = rol32(b,30) + +#define step60_79(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \ + b = rol32(b,30) + +static void OPT_FIX sha1_single_for_mh_sha1(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e; + uint32_t w[16] = { 0 }; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + + step00_19(0, a, b, c, d, e); + step00_19(1, e, a, b, c, d); + step00_19(2, d, e, a, b, c); + step00_19(3, c, d, e, a, b); + step00_19(4, b, c, d, e, a); + step00_19(5, a, b, c, d, e); + step00_19(6, e, a, b, c, d); + step00_19(7, d, e, a, b, c); + step00_19(8, c, d, e, a, b); + step00_19(9, b, c, d, e, a); + step00_19(10, a, b, c, d, e); + step00_19(11, e, a, b, c, d); + step00_19(12, d, e, a, b, c); + step00_19(13, c, d, e, a, b); + step00_19(14, b, c, d, e, a); + step00_19(15, a, b, c, d, e); + step00_19(16, e, a, b, c, d); + step00_19(17, d, e, a, 
b, c); + step00_19(18, c, d, e, a, b); + step00_19(19, b, c, d, e, a); + + step20_39(20, a, b, c, d, e); + step20_39(21, e, a, b, c, d); + step20_39(22, d, e, a, b, c); + step20_39(23, c, d, e, a, b); + step20_39(24, b, c, d, e, a); + step20_39(25, a, b, c, d, e); + step20_39(26, e, a, b, c, d); + step20_39(27, d, e, a, b, c); + step20_39(28, c, d, e, a, b); + step20_39(29, b, c, d, e, a); + step20_39(30, a, b, c, d, e); + step20_39(31, e, a, b, c, d); + step20_39(32, d, e, a, b, c); + step20_39(33, c, d, e, a, b); + step20_39(34, b, c, d, e, a); + step20_39(35, a, b, c, d, e); + step20_39(36, e, a, b, c, d); + step20_39(37, d, e, a, b, c); + step20_39(38, c, d, e, a, b); + step20_39(39, b, c, d, e, a); + + step40_59(40, a, b, c, d, e); + step40_59(41, e, a, b, c, d); + step40_59(42, d, e, a, b, c); + step40_59(43, c, d, e, a, b); + step40_59(44, b, c, d, e, a); + step40_59(45, a, b, c, d, e); + step40_59(46, e, a, b, c, d); + step40_59(47, d, e, a, b, c); + step40_59(48, c, d, e, a, b); + step40_59(49, b, c, d, e, a); + step40_59(50, a, b, c, d, e); + step40_59(51, e, a, b, c, d); + step40_59(52, d, e, a, b, c); + step40_59(53, c, d, e, a, b); + step40_59(54, b, c, d, e, a); + step40_59(55, a, b, c, d, e); + step40_59(56, e, a, b, c, d); + step40_59(57, d, e, a, b, c); + step40_59(58, c, d, e, a, b); + step40_59(59, b, c, d, e, a); + + step60_79(60, a, b, c, d, e); + step60_79(61, e, a, b, c, d); + step60_79(62, d, e, a, b, c); + step60_79(63, c, d, e, a, b); + step60_79(64, b, c, d, e, a); + step60_79(65, a, b, c, d, e); + step60_79(66, e, a, b, c, d); + step60_79(67, d, e, a, b, c); + step60_79(68, c, d, e, a, b); + step60_79(69, b, c, d, e, a); + step60_79(70, a, b, c, d, e); + step60_79(71, e, a, b, c, d); + step60_79(72, d, e, a, b, c); + step60_79(73, c, d, e, a, b); + step60_79(74, b, c, d, e, a); + step60_79(75, a, b, c, d, e); + step60_79(76, e, a, b, c, d); + step60_79(77, d, e, a, b, c); + step60_79(78, c, d, e, a, b); + step60_79(79, b, c, d, e, a); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; +} + +void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA1_BLOCK_SIZE]; + + digest[0] = MH_SHA1_H0; + digest[1] = MH_SHA1_H1; + digest[2] = MH_SHA1_H2; + digest[3] = MH_SHA1_H3; + digest[4] = MH_SHA1_H4; + + i = len; + while (i >= SHA1_BLOCK_SIZE) { + sha1_single_for_mh_sha1(input_data, digest); + input_data += SHA1_BLOCK_SIZE; + i -= SHA1_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++) + buf[j] = 0; + + if (i > SHA1_BLOCK_SIZE - 8) + i = 2 * SHA1_BLOCK_SIZE; + else + i = SHA1_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha1_single_for_mh_sha1(buf, digest); + if (i == (2 * SHA1_BLOCK_SIZE)) + sha1_single_for_mh_sha1(buf + SHA1_BLOCK_SIZE, digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am new file mode 100644 index 000000000..e6ea6784c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am @@ -0,0 +1,89 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_murmur = mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c + +lsrc_stitch = mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm + +lsrc_stitch += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm + +lsrc_x86_64 += $(lsrc_murmur) \ + $(lsrc_stitch) + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += $(lsrc_murmur) \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S \ + mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S + +lsrc_base_aliases += $(lsrc_murmur) \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c + +other_src += include/reg_sizes.asm \ + include/multibinary.asm \ + include/test.h \ + mh_sha1/mh_sha1_internal.h \ + mh_sha1_murmur3_x64_128/murmur3_x64_128.c \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h + +src_include += -I $(srcdir)/mh_sha1_murmur3_x64_128 + +extern_hdrs += 
include/mh_sha1_murmur3_x64_128.h + +unit_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test \ + mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test + +perf_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf + + +mh_sha1_murmur3_x64_128_test: mh_sha1_ref.o murmur3_x64_128.o +mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la + +mh_sha1_murmur3_x64_128_update_test: mh_sha1_ref.o murmur3_x64_128.o +mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_update_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la + +mh_sha1_murmur3_x64_128_perf: mh_sha1_ref.o murmur3_x64_128.o +mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_perf_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c new file mode 100644 index 000000000..e6993703a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c @@ -0,0 +1,53 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_update) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(mh_sha1_murmur3_update_ce); + + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(mh_sha1_murmur3_update_asimd); + + return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_update); +} + +DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_finalize) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(mh_sha1_murmur3_finalize_ce); + + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(mh_sha1_murmur3_finalize_asimd); + + return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_finalize); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h new file mode 100644 index 000000000..22b33cbd2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h @@ -0,0 +1,91 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
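The dispatcher above selects an implementation once, based on the kernel-reported HWCAP bits. Below is a standalone sketch of the same probe, assuming Linux on aarch64 (getauxval from <sys/auxv.h>, HWCAP_SHA1 and HWCAP_ASIMD from <asm/hwcap.h>); pick_impl() and the stub functions are illustrative only, not the dispatcher's actual providers.

/* Sketch: runtime CPU-feature dispatch in the style of the code above. */
#include <stdio.h>
#include <sys/auxv.h>      /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>     /* HWCAP_SHA1, HWCAP_ASIMD (Linux/aarch64 only) */

typedef void (*block_fn)(void);

static void block_ce(void)    { puts("SHA-1 crypto extensions"); }
static void block_asimd(void) { puts("ASIMD"); }
static void block_base(void)  { puts("portable C"); }

static block_fn pick_impl(void)
{
        unsigned long hwcap = getauxval(AT_HWCAP);

        if (hwcap & HWCAP_SHA1)
                return block_ce;     /* prefer the SHA-1 instructions */
        if (hwcap & HWCAP_ASIMD)
                return block_asimd;  /* otherwise fall back to NEON */
        return block_base;           /* portable fallback */
}

int main(void)
{
        pick_impl()();
        return 0;
}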
+**********************************************************************/ + +#ifndef _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_ +#define _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_ + +/** + * @file mh_sha1_murmur3_aarch64_internal.h + * @brief mh_sha1_murmur3_aarch64 internal function prototypes and macros + * + * Interface for mh_sha1_murmur3_aarch64 internal functions + * + */ +#include +#include "mh_sha1_murmur3_x64_128_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires Crypto Extension + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ +void mh_sha1_murmur3_block_ce(const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t + murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires ASIMD + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ +void mh_sha1_murmur3_block_asimd(const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t + murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c new file mode 100644 index 000000000..9cac8504e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
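The prototypes above take the interim digests as uint32_t[SHA1_DIGEST_WORDS][HASH_SEGS], i.e. word-major with one column per segment, which is what lets a single vector load fetch the same digest word from several segments at once. A small sketch of that layout, assuming SHA1_DIGEST_WORDS is 5 and HASH_SEGS is 16 as in the init code later in this patch; get_segment_word() is an illustrative helper, not part of the library.

/* Sketch: word-major interim digest layout used by the block routines. */
#include <stdint.h>

#define SHA1_DIGEST_WORDS 5
#define HASH_SEGS         16

static uint32_t get_segment_word(const uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
                                 int segment, int word)
{
        /* digests[word][0..15] are contiguous, one lane per segment */
        return digests[word][segment];
}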
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "mh_sha1_murmur3_aarch64_internal.h" + +extern void mh_sha1_tail_asimd(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t * frame_buffer, + uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + +extern void mh_sha1_block_asimd(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + +// mh_sha1_murmur3_update_asimd.c +#define UPDATE_FUNCTION mh_sha1_murmur3_update_asimd +#define BLOCK_FUNCTION mh_sha1_murmur3_block_asimd +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +// mh_sha1_murmur3_finalize_asimd.c +#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_asimd +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S new file mode 100644 index 000000000..575129f36 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S @@ -0,0 +1,224 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
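mh_sha1_murmur3_asimd.c above stamps out its update and finalize routines by defining UPDATE_FUNCTION/BLOCK_FUNCTION (and FINALIZE_FUNCTION/MH_SHA1_TAIL_FUNCTION) and then including the shared *_base.c bodies. Here is a minimal sketch of that pattern, kept self-contained by letting the demo include itself rather than a separate base file; every identifier below is hypothetical and only mirrors the macro-parameterised include trick.

/* Sketch: "#define the entry points, then #include the shared body". */
#ifndef TEMPLATE_BODY

#include <stdio.h>

static int block_scalar(int x) { return x + 1; }
static int block_fancy(int x)  { return x * 2; }

/* instantiate sum_scalar() from the shared body below */
#define TEMPLATE_BODY
#define UPDATE_FUNCTION sum_scalar
#define BLOCK_FUNCTION  block_scalar
#include __FILE__
#undef UPDATE_FUNCTION
#undef BLOCK_FUNCTION

/* instantiate sum_fancy() from the same body */
#define UPDATE_FUNCTION sum_fancy
#define BLOCK_FUNCTION  block_fancy
#include __FILE__
#undef UPDATE_FUNCTION
#undef BLOCK_FUNCTION
#undef TEMPLATE_BODY

int main(void)
{
        int v[3] = { 1, 2, 3 };
        printf("%d %d\n", sum_scalar(v, 3), sum_fancy(v, 3)); /* 9 12 */
        return 0;
}

#else  /* shared "base" body, compiled once per instantiation */

static int UPDATE_FUNCTION(const int *v, int n)
{
        int acc = 0;
        for (int i = 0; i < n; i++)
                acc += BLOCK_FUNCTION(v[i]);
        return acc;
}

#endif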
+**********************************************************************/ + + .arch armv8-a + +#include "sha1_asimd_common.S" +.macro sha1_step_16_79_interleave0 windex:req + // interleaving murmur3 operation + .if (\windex % 4) == 0 + ldp mur_data1, mur_data2, [mur_data], #16 + .endif + .if (\windex % 4) == 1 + /* rotate left by 31 bits */ + ror mur_data1, mur_data1, #64-31 + /* rotate left by 33 bits */ + ror mur_data2, mur_data2, #64-33 + .endif + .if (\windex % 4) == 2 + eor mur_hash1, mur_hash1, mur_data1 + /* rotate left by 27 bits */ + ror mur_hash1, mur_hash1, #64-27 + .endif + .if (\windex % 4) == 3 + eor mur_hash2, mur_hash2, mur_data2 + /* rotate left by 31 bits */ + ror mur_hash2, mur_hash2, #64-31 + .endif +.endm + +.macro sha1_step_16_79_interleave1 windex:req + // interleaving murmur3 operation + .if (\windex % 4) == 0 + mul mur_data1, mur_data1, mur_c1 + mul mur_data2, mur_data2, mur_c2 + .endif + .if (\windex % 4) == 1 + mul mur_data1, mur_data1, mur_c2 + mul mur_data2, mur_data2, mur_c1 + .endif + .if (\windex % 4) == 2 + add mur_hash1, mur_hash1, mur_hash2 + //mur_hash1 = mur_hash1 * 5 + N1 + add mur_hash1, mur_hash1, mur_hash1, LSL #2 + add mur_hash1, mur_n1, mur_hash1 + .endif + .if (\windex % 4) == 3 + add mur_hash2, mur_hash2, mur_hash1 + // mur_hash2 = mur_hash2 * 5 + N2 + add mur_hash2, mur_hash2, mur_hash2, LSL #2 + add mur_hash2, mur_n2, mur_hash2 + .endif +.endm + +.macro load_x4_word idx:req + ld1 {WORD\idx\().16b},[segs_ptr] + add segs_ptr,segs_ptr,#64 +.endm + +/* + * void mh_sha1_murmur3_block_asimd (const uint8_t * input_data, + * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + * uint32_t num_blocks); + * arg 0 pointer to input data + * arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) + * arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. + * arg 3 pointer to murmur3 digest + * arg 4 number of 1KB blocks + */ + + input_data .req x0 + sha1_digest .req x1 + data_buf .req x2 + mur_digest .req x3 + num_blocks .req w4 + + src .req x5 + dst .req x6 + offs .req x7 + mh_segs .req x8 + tmp .req x9 + tmpw .req w9 + segs_ptr .req x10 + mur_hash1 .req x11 + mur_hash2 .req x12 + mur_c1 .req x13 + mur_c2 .req x14 + mur_data1 .req x19 + mur_data2 .req x20 + mur_data .req x21 + mur_n1 .req x22 + mur_n1_w .req w22 + mur_n2 .req x23 + mur_n2_w .req w23 + block_ctr .req w24 + + .global mh_sha1_murmur3_block_asimd + .type mh_sha1_murmur3_block_asimd, %function +mh_sha1_murmur3_block_asimd: + cmp num_blocks, #0 + beq .return + sha1_asimd_save_stack + stp x19, x20, [sp, -48]! 
+ stp x21, x22, [sp, 16] + stp x23, x24, [sp, 32] + + mov mur_data, input_data + ldr mur_hash1, [mur_digest] + ldr mur_hash2, [mur_digest, 8] + adr mur_c1, C1 + ldr mur_c1, [mur_c1] + adr mur_c2, C2 + ldr mur_c2, [mur_c2] + adr tmp, N1 + ldr mur_n1_w, [tmp] + adr tmp, N2 + ldr mur_n2_w, [tmp] + + mov mh_segs, #0 +.seg_loops: + add segs_ptr,input_data,mh_segs + mov offs, #64 + add src, sha1_digest, mh_segs + ld1 {VA.4S}, [src], offs + ld1 {VB.4S}, [src], offs + ld1 {VC.4S}, [src], offs + ld1 {VD.4S}, [src], offs + ld1 {VE.4S}, [src], offs + mov block_ctr,num_blocks + +.block_loop: + sha1_single + subs block_ctr, block_ctr, 1 + bne .block_loop + + mov offs, #64 + add dst, sha1_digest, mh_segs + st1 {VA.4S}, [dst], offs + st1 {VB.4S}, [dst], offs + st1 {VC.4S}, [dst], offs + st1 {VD.4S}, [dst], offs + st1 {VE.4S}, [dst], offs + + add mh_segs, mh_segs, #16 + cmp mh_segs, #64 + bne .seg_loops + + /* save murmur-hash digest */ + str mur_hash1, [mur_digest], #8 + str mur_hash2, [mur_digest] + + ldp x21, x22, [sp, 16] + ldp x23, x24, [sp, 32] + ldp x19, x20, [sp], 48 + sha1_asimd_restore_stack +.return: + ret + + .size mh_sha1_murmur3_block_asimd, .-mh_sha1_murmur3_block_asimd + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +KEY_0: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 +N1: + .word 0x52dce729 + .word 0x52dce729 + .word 0x52dce729 + .word 0x52dce729 +N2: + .word 0x38495ab5 + .word 0x38495ab5 + .word 0x38495ab5 + .word 0x38495ab5 +C1: + .dword 0x87c37b91114253d5 + .dword 0x87c37b91114253d5 +C2: + .dword 0x4cf5ad432745937f + .dword 0x4cf5ad432745937f diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S new file mode 100644 index 000000000..7f4256e20 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S @@ -0,0 +1,482 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
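The murmur3 work interleaved into the SHA-1 steps above is, as far as the constants C1/C2/N1/N2 and the rotate amounts indicate, one body round of MurmurHash3 x64_128 split across four steps. For comparison, a scalar C reference of that round; murmur3_x64_128_round() is an illustrative name, not a function from this patch.

/* Sketch: one MurmurHash3 x64_128 body round (16 input bytes in k1/k2). */
#include <stdint.h>

#define ROTL64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))

static void murmur3_x64_128_round(uint64_t k1, uint64_t k2,
                                  uint64_t *h1, uint64_t *h2)
{
        const uint64_t c1 = 0x87c37b91114253d5ULL;   /* C1 */
        const uint64_t c2 = 0x4cf5ad432745937fULL;   /* C2 */

        k1 *= c1;  k1 = ROTL64(k1, 31);  k1 *= c2;   /* murmur3_00 / 01 */
        *h1 ^= k1;                                   /* murmur3_02 */
        *h1 = ROTL64(*h1, 27);
        *h1 += *h2;
        *h1 = *h1 * 5 + 0x52dce729;                  /* N1 */

        k2 *= c2;  k2 = ROTL64(k2, 33);  k2 *= c1;   /* murmur3_00 / 01 */
        *h2 ^= k2;                                   /* murmur3_03 */
        *h2 = ROTL64(*h2, 31);
        *h2 += *h1;
        *h2 = *h2 * 5 + 0x38495ab5;                  /* N2 */
}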
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + + + +/* +Variable list +*/ + + declare_var_vector_reg lane0_msg_0, 0 + declare_var_vector_reg lane1_msg_0, 1 + declare_var_vector_reg lane2_msg_0, 2 + declare_var_vector_reg lane3_msg_0, 3 + declare_var_vector_reg lane0_msg_1, 4 + declare_var_vector_reg lane1_msg_1, 5 + declare_var_vector_reg lane2_msg_1, 6 + declare_var_vector_reg lane3_msg_1, 7 + declare_var_vector_reg lane0_msg_2, 8 + declare_var_vector_reg lane1_msg_2, 9 + declare_var_vector_reg lane2_msg_2,10 + declare_var_vector_reg lane3_msg_2,11 + declare_var_vector_reg lane0_msg_3,12 + declare_var_vector_reg lane1_msg_3,13 + declare_var_vector_reg lane2_msg_3,14 + declare_var_vector_reg lane3_msg_3,15 + + declare_var_vector_reg lane0_abcd ,16 + declare_var_vector_reg lane1_abcd ,17 + declare_var_vector_reg lane2_abcd ,18 + declare_var_vector_reg lane3_abcd ,19 + declare_var_vector_reg lane0_tmp0 ,20 + declare_var_vector_reg lane1_tmp0 ,21 + declare_var_vector_reg lane2_tmp0 ,22 + declare_var_vector_reg lane3_tmp0 ,23 + declare_var_vector_reg lane0_tmp1 ,24 + declare_var_vector_reg lane1_tmp1 ,25 + declare_var_vector_reg lane2_tmp1 ,26 + declare_var_vector_reg lane3_tmp1 ,27 + + + declare_var_vector_reg e0 ,28 + declare_var_vector_reg e1 ,29 + declare_var_vector_reg key ,30 + declare_var_vector_reg tmp ,31 + + key_adr .req x5 + msg_adr .req x6 + block_cnt .req x7 + offs .req x8 + mur_n1 .req x9 + mur_n1_w .req w9 + mur_n2 .req x10 + mur_n2_w .req w10 + mur_hash1 .req x11 + mur_hash2 .req x12 + mur_c1 .req x13 + mur_c2 .req x14 + mur_data1 .req x15 + + digest_adr .req x16 + tmp0_adr .req x17 + tmp1_adr .req x18 + mur_data2 .req x19 + mur_data .req x20 + +.macro murmur3_00 + ldp mur_data1, mur_data2, [mur_data], #16 + mul mur_data1, mur_data1, mur_c1 + mul mur_data2, mur_data2, mur_c2 +.endm + +.macro murmur3_01 + /* rotate left by 31 bits */ + ror mur_data1, mur_data1, #64-31 + /* rotate left by 33 bits */ + ror mur_data2, mur_data2, #64-33 + mul mur_data1, mur_data1, mur_c2 + mul mur_data2, mur_data2, mur_c1 +.endm + +.macro murmur3_02 + eor mur_hash1, mur_hash1, mur_data1 + /* rotate left by 27 bits */ + ror mur_hash1, mur_hash1, #64-27 + add mur_hash1, mur_hash1, mur_hash2 + // mur_hash1 = mur_hash1 * 5 + N1 + add mur_hash1, mur_hash1, mur_hash1, LSL #2 + add mur_hash1, mur_n1, mur_hash1 +.endm + +.macro murmur3_03 + eor mur_hash2, mur_hash2, mur_data2 + /* rotate left by 31 bits */ + ror mur_hash2, mur_hash2, #64-31 + add mur_hash2, mur_hash2, mur_hash1 + // mur_hash2 = mur_hash2 * 5 + N2 + add mur_hash2, mur_hash2, mur_hash2, LSL #2 + add mur_hash2, mur_n2, mur_hash2 +.endm + +/** + * maros for round 4-67 + * the code execute 16 times per block, allowing the inserted murmur3 operation to process 256 bytes +*/ +.macro sha1_4_rounds 
inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req + sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s + sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s + sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s + sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s + mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0] + mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0] + mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0] + mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0] + mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0] + mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1] + mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2] + mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3] + \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s + murmur3_00 + \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s + murmur3_01 + \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s + murmur3_02 + \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s + murmur3_03 + ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr] + add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s + add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s + add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s + add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s + st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr] + sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s + sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s + sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s + sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s + sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s + sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s + sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s + sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s +.endm + + +/* + * void mh_sha1_murmur3_block_ce (const uint8_t * input_data, + * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + * uint32_t num_blocks); + * arg 0 pointer to input data + * arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) + * arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. + * arg 3 pointer to murmur3 digest + * arg 4 number of 1KB blocks + */ + +/* +Arguements list +*/ + input_data .req x0 + digests .req x1 + frame_buffer .req x2 + mur_digest .req x3 + num_blocks .req w4 + + .global mh_sha1_murmur3_block_ce + .type mh_sha1_murmur3_block_ce, %function +mh_sha1_murmur3_block_ce: + // save temp vector registers + stp d8, d9, [sp, -80]! 
+ + stp d10, d11, [sp, 16] + stp d12, d13, [sp, 32] + stp d14, d15, [sp, 48] + stp x19, x20, [sp, 64] + + mov mur_data, input_data + ldr mur_hash1, [mur_digest] + ldr mur_hash2, [mur_digest, 8] + adr mur_c1, C1 + ldr mur_c1, [mur_c1] + adr mur_c2, C2 + ldr mur_c2, [mur_c2] + adr tmp0_adr, N1 + ldr mur_n1_w, [tmp0_adr] + adr tmp0_adr, N2 + ldr mur_n2_w, [tmp0_adr] + + mov tmp0_adr,frame_buffer + add tmp1_adr,tmp0_adr,128 + + +start_loop: + mov block_cnt,0 + mov msg_adr,input_data +lane_loop: + mov offs,64 + adr key_adr,KEY_0 + // load msg 0 + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs + + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs + + add digest_adr,digests,block_cnt + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs + ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs + ldr e0_q,[digest_adr] + + // load key_0 + ldr key_q,[key_adr] + + rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b + rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b + rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b + rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b + rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b + rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b + rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b + rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b + rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b + rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b + rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b + rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b + rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b + rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b + rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b + rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b + + add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s + add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s + add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s + add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s + st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr] + + add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s + add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s + add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s + add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s + + /* rounds 0-3 */ + sha1h lane0_tmp1_s,lane0_abcd_s + sha1h lane1_tmp1_s,lane1_abcd_s + sha1h lane2_tmp1_s,lane2_abcd_s + sha1h lane3_tmp1_s,lane3_abcd_s + mov e1_v.S[0],lane0_tmp1_v.S[0] + mov e1_v.S[1],lane1_tmp1_v.S[0] + mov e1_v.S[2],lane2_tmp1_v.S[0] + mov e1_v.S[3],lane3_tmp1_v.S[0] + mov lane0_tmp1_v.S[0],e0_v.S[0] + mov lane1_tmp1_v.S[0],e0_v.S[1] + mov lane2_tmp1_v.S[0],e0_v.S[2] + mov lane3_tmp1_v.S[0],e0_v.S[3] + sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s + sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s + sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s + sha1c 
lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s + ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr] + add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s + sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s + add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s + sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s + add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s + sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s + add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s + sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s + st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr] + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + + + adr key_adr,KEY_1 + ldr key_q,[key_adr] + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + + adr key_adr,KEY_2 + ldr key_q,[key_adr] + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + + adr key_adr,KEY_3 + ldr key_q,[key_adr] + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0 + + // msg2 and msg1 are free + mov lane0_msg_2_v.S[0],e1_v.S[0] + mov lane1_msg_2_v.S[0],e1_v.S[1] + mov lane2_msg_2_v.S[0],e1_v.S[2] + mov lane3_msg_2_v.S[0],e1_v.S[3] + + /* rounds 68-71 */ + sha1h lane0_msg_1_s,lane0_abcd_s + sha1h lane1_msg_1_s,lane1_abcd_s + sha1h lane2_msg_1_s,lane2_abcd_s + sha1h lane3_msg_1_s,lane3_abcd_s + sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s + sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s + sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s + sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s + add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s + add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s + add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s + add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s + sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s + sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s + sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s + sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s + + /* rounds 72-75 */ + sha1h lane0_msg_2_s,lane0_abcd_s + sha1h lane1_msg_2_s,lane1_abcd_s + sha1h lane2_msg_2_s,lane2_abcd_s + sha1h lane3_msg_2_s,lane3_abcd_s + sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s + sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s + sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s + sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s + + /* rounds 76-79 */ + sha1h lane0_msg_1_s,lane0_abcd_s + sha1h lane1_msg_1_s,lane1_abcd_s + sha1h lane2_msg_1_s,lane2_abcd_s + sha1h lane3_msg_1_s,lane3_abcd_s + sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s + sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s + sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s + sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s + add digest_adr,digests,block_cnt 
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs + ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs + ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr] + + add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S + add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S + add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S + add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S + + add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S + add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S + add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S + add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S + + add digest_adr,digests,block_cnt + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs + st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs + st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr] + + add block_cnt,block_cnt,16 + cmp block_cnt,64 + add msg_adr,input_data,block_cnt + add digest_adr,digests,block_cnt + bcc lane_loop + + subs num_blocks,num_blocks,1 + add input_data,input_data,1024 + bhi start_loop + + /* save murmur-hash digest */ + str mur_hash1, [mur_digest], #8 + str mur_hash2, [mur_digest] + +exit_func: + // restore temp register + ldp d10, d11, [sp, 16] + ldp d12, d13, [sp, 32] + ldp d14, d15, [sp, 48] + ldp x19, x20, [sp, 64] + ldp d8, d9, [sp], 80 + ret + + .size mh_sha1_murmur3_block_ce, .-mh_sha1_murmur3_block_ce + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +KEY_0: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + +N1: + .word 0x52dce729 + .word 0x52dce729 + .word 0x52dce729 + .word 0x52dce729 +N2: + .word 0x38495ab5 + .word 0x38495ab5 + .word 0x38495ab5 + .word 0x38495ab5 + +C1: + .dword 0x87c37b91114253d5 + .dword 0x87c37b91114253d5 +C2: + .dword 0x4cf5ad432745937f + .dword 0x4cf5ad432745937f diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c new file mode 100644 index 000000000..4da674fba --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "mh_sha1_murmur3_aarch64_internal.h" + +extern void mh_sha1_tail_ce(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], + uint8_t * frame_buffer, + uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]); + +extern void mh_sha1_block_ce(const uint8_t * input_data, + uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks); + +// mh_sha1_murmur3_update_ce.c +#define UPDATE_FUNCTION mh_sha1_murmur3_update_ce +#define BLOCK_FUNCTION mh_sha1_murmur3_block_ce +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +// mh_sha1_murmur3_finalize_ce.c +#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_ce +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S new file mode 100644 index 000000000..051a6157e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S @@ -0,0 +1,34 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include "aarch64_multibinary.h" + +mbin_interface mh_sha1_murmur3_x64_128_update +mbin_interface mh_sha1_murmur3_x64_128_finalize diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S new file mode 100644 index 000000000..ccc66f41a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S @@ -0,0 +1,271 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + .arch armv8-a + +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor VF.16b, VC.16b, VD.16b + and VF.16b, VB.16b, VF.16b + eor VF.16b, VD.16b, VF.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor VF.16b, VB.16b, VC.16b + eor VF.16b, VF.16b, VD.16b +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and vT0.16b, VB.16b, VC.16b + and vT1.16b, VB.16b, VD.16b + and vT2.16b, VC.16b, VD.16b + orr VF.16b, vT0.16b, vT1.16b + orr VF.16b, VF.16b, vT2.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_x4_word \windex + .endif +.endm + +// FUNC_F0 is merged into STEP_00_15 for efficiency +.macro SHA1_STEP_00_15_F0 windex:req + rev32 WORD\windex\().16b,WORD\windex\().16b + next_word=\windex+1 + load_next_word %next_word + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, VA.4s, 32 - 5 + add VE.4s, VE.4s, VK.4s + sli VT.4s, VA.4s, 5 + eor VF.16b, VC.16b, VD.16b + add VE.4s, VE.4s, WORD\windex\().4s + and VF.16b, VB.16b, VF.16b + add VE.4s, VE.4s, VT.4s + eor VF.16b, VD.16b, VF.16b + ushr VT.4s, VB.4s, 32 - 30 + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor vT0.16b,\reg_3\().16b,\reg_8\().16b + eor VT.16b,\reg_14\().16b,\reg_16\().16b + sha1_step_16_79_interleave0 \windex + eor vT0.16b,vT0.16b,VT.16b + sha1_step_16_79_interleave1 \windex + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, vT0.4s, 32 - 1 + add VE.4s, VE.4s, VK.4s + ushr vT1.4s, VA.4s, 32 - 5 + sli VT.4s, vT0.4s, 1 + add VE.4s, VE.4s, VT.4s + sli vT1.4s, VA.4s, 5 + mov \reg_16\().16b,VT.16b + add VE.4s, VE.4s, vT1.4s + ushr VT.4s, VB.4s, 32 - 30 + \func_f + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + + VA .req v0 + VB .req v1 + VC .req v2 + VD .req v3 + VE .req v4 + VT .req v5 + VF .req v6 + VK .req v7 + WORD0 .req v8 + WORD1 .req v9 + WORD2 .req v10 + WORD3 .req v11 + WORD4 .req v12 + WORD5 .req v13 + WORD6 .req v14 + WORD7 .req v15 + WORD8 .req v16 + WORD9 .req v17 + WORD10 .req v18 + WORD11 .req v19 + WORD12 .req v20 + WORD13 .req v21 + WORD14 .req v22 + WORD15 .req v23 + vT0 .req v24 + vT1 .req v25 + vT2 .req v26 + vAA .req v27 + vBB .req v28 + vCC .req v29 + vDD .req v30 + vEE .req v31 + TT .req v0 + sha1key_adr .req x15 + +.macro SWAP_STATES + // shifted VB is held in VT after each step + .unreq TT + TT .req VE + .unreq VE + VE .req VD + .unreq VD + VD .req VC + .unreq VC + VC .req VT + .unreq VT + VT .req VB + .unreq VB + VB .req VA + .unreq VA + VA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15_F0 windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES + + .if \windex == 
79 + // after 80 steps, the registers ABCDET has shifted from + // its orignal order of 012345 to 341520 + // have to swap back for both compile- and run-time correctness + mov v0.16b,v3.16b + .unreq VA + VA .req v0 + + mov vT0.16b,v2.16b + mov v2.16b,v1.16b + mov v1.16b,v4.16b + .unreq VB + VB .req v1 + .unreq VC + VC .req v2 + + mov v3.16b,v5.16b + .unreq VD + VD .req v3 + + mov v4.16b,vT0.16b + .unreq VE + VE .req v4 + + .unreq VT + VT .req v5 + .endif +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.macro sha1_single + load_x4_word 0 + + mov vAA.16B, VA.16B + mov vBB.16B, VB.16B + mov vCC.16B, VC.16B + mov vDD.16B, VD.16B + mov vEE.16B, VE.16B + + adr sha1key_adr, KEY_0 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add VA.4s, vAA.4s, VA.4s + add VB.4s, vBB.4s, VB.4s + add VC.4s, vCC.4s, VC.4s + add VD.4s, vDD.4s, VD.4s + add VE.4s, vEE.4s, VE.4s +.endm + +.macro sha1_asimd_save_stack + stp d8,d9,[sp, -64]! + stp d10,d11,[sp, 16] + stp d12,d13,[sp, 32] + stp d14,d15,[sp, 48] +.endm + +.macro sha1_asimd_restore_stack + ldp d10,d11,[sp, 16] + ldp d12,d13,[sp, 32] + ldp d14,d15,[sp, 48] + ldp d8,d9,[sp],64 +.endm diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c new file mode 100644 index 000000000..518adb797 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c @@ -0,0 +1,154 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
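sha1_asimd_common.S vectorises the standard FIPS 180-1 round functions and round constants (FUNC_F0..FUNC_F3 and KEY_0..KEY_3). For comparison, the scalar form of the same selection, with sha1_f() and sha1_k() as illustrative names.

/* Sketch: SHA-1 per-round function and constant, selected by round index t. */
#include <stdint.h>

static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
{
        if (t < 20) return d ^ (b & (c ^ d));           /* FUNC_F0 (Ch)  */
        if (t < 40) return b ^ c ^ d;                   /* FUNC_F1 (Par) */
        if (t < 60) return (b & c) | (b & d) | (c & d); /* FUNC_F2 (Maj) */
        return b ^ c ^ d;                               /* FUNC_F3 == F1 */
}

static uint32_t sha1_k(int t)
{
        if (t < 20) return 0x5a827999;  /* KEY_0 */
        if (t < 40) return 0x6ed9eba1;  /* KEY_1 */
        if (t < 60) return 0x8f1bbcdc;  /* KEY_2 */
        return 0xca62c1d6;              /* KEY_3 */
}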
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "mh_sha1_murmur3_x64_128_internal.h" + +int mh_sha1_murmur3_x64_128_init(struct mh_sha1_murmur3_x64_128_ctx *ctx, uint64_t murmur_seed) +{ + uint64_t *murmur3_x64_128_hash; + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; + uint32_t i; + + if (ctx == NULL) + return MH_SHA1_MURMUR3_CTX_ERROR_NULL; + + memset(ctx, 0, sizeof(*ctx)); + + mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; + for (i = 0; i < HASH_SEGS; i++) { + mh_sha1_segs_digests[0][i] = MH_SHA1_H0; + mh_sha1_segs_digests[1][i] = MH_SHA1_H1; + mh_sha1_segs_digests[2][i] = MH_SHA1_H2; + mh_sha1_segs_digests[3][i] = MH_SHA1_H3; + mh_sha1_segs_digests[4][i] = MH_SHA1_H4; + } + + murmur3_x64_128_hash = (uint64_t *) ctx->murmur3_x64_128_digest; + murmur3_x64_128_hash[0] = murmur_seed; + murmur3_x64_128_hash[1] = murmur_seed; + + return MH_SHA1_MURMUR3_CTX_ERROR_NONE; +} + +void mh_sha1_murmur3_x64_128_block_base(const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t + murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks) +{ + + mh_sha1_block_base(input_data, mh_sha1_digests, frame_buffer, num_blocks); + + murmur3_x64_128_block(input_data, + num_blocks * MH_SHA1_BLOCK_SIZE / MUR_BLOCK_SIZE, + murmur3_x64_128_digests); + + return; +} + +#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \ + || defined( _M_X64) || defined(_M_IX86)) +/***************mh_sha1_murmur3_x64_128_update***********/ +// mh_sha1_murmur3_x64_128_update_sse.c +#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse +#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_sse +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +// mh_sha1_murmur3_x64_128_update_avx.c +#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx +#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +// mh_sha1_murmur3_x64_128_update_avx2.c +#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx2 +#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx2 +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +/***************mh_sha1_murmur3_x64_128_finalize***********/ +// mh_sha1_murmur3_x64_128_finalize_sse.c +#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_sse +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION + +// mh_sha1_murmur3_x64_128_finalize_avx.c +#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION + +// mh_sha1_murmur3_x64_128_finalize_avx2.c 
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx2 +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2 +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION + +/***************version info***********/ + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver mh_sha1_murmur3_x64_128_init_slver_00000251; +struct slver mh_sha1_murmur3_x64_128_init_slver = { 0x0251, 0x00, 0x00 }; + +// mh_sha1_murmur3_x64_128_update version info +struct slver mh_sha1_murmur3_x64_128_update_sse_slver_00000254; +struct slver mh_sha1_murmur3_x64_128_update_sse_slver = { 0x0254, 0x00, 0x00 }; + +struct slver mh_sha1_murmur3_x64_128_update_avx_slver_02000256; +struct slver mh_sha1_murmur3_x64_128_update_avx_slver = { 0x0256, 0x00, 0x02 }; + +struct slver mh_sha1_murmur3_x64_128_update_avx2_slver_04000258; +struct slver mh_sha1_murmur3_x64_128_update_avx2_slver = { 0x0258, 0x00, 0x04 }; + +// mh_sha1_murmur3_x64_128_finalize version info +struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver_00000255; +struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver = { 0x0255, 0x00, 0x00 }; + +struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver_02000257; +struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver = { 0x0257, 0x00, 0x02 }; + +struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver_04000259; +struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver = { 0x0259, 0x00, 0x04 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c new file mode 100644 index 000000000..fbef1ac13 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c @@ -0,0 +1,67 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include "mh_sha1_murmur3_x64_128_internal.h" + +#ifdef HAVE_AS_KNOWS_AVX512 + +/***************mh_sha1_murmur3_x64_128_update***********/ +// mh_sha1_murmur3_x64_128_update_avx512.c +#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx512 +#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx512 +#include "mh_sha1_murmur3_x64_128_update_base.c" +#undef UPDATE_FUNCTION +#undef BLOCK_FUNCTION + +/***************mh_sha1_murmur3_x64_128_finalize***********/ +// mh_sha1_murmur3_x64_128_finalize_avx512.c +#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx512 +#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512 +#include "mh_sha1_murmur3_x64_128_finalize_base.c" +#undef FINALIZE_FUNCTION +#undef MH_SHA1_TAIL_FUNCTION + +/***************version info***********/ + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// mh_sha1_murmur3_x64_128_update version info +struct slver mh_sha1_murmur3_x64_128_update_avx512_slver_0600025c; +struct slver mh_sha1_murmur3_x64_128_update_avx512_slver = { 0x025c, 0x00, 0x06 }; + +// mh_sha1_murmur3_x64_128_finalize version info +struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver_0600025d; +struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver = { 0x025d, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c new file mode 100644 index 000000000..28f15086d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c @@ -0,0 +1,43 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "mh_sha1_murmur3_x64_128_internal.h" +#include +int mh_sha1_murmur3_x64_128_update(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer, + uint32_t len) +{ + return mh_sha1_murmur3_x64_128_update_base(ctx, buffer, len); + +} + +int mh_sha1_murmur3_x64_128_finalize(struct mh_sha1_murmur3_x64_128_ctx *ctx, + void *mh_sha1_digest, void *murmur3_x64_128_digest) +{ + return mh_sha1_murmur3_x64_128_finalize_base(ctx, mh_sha1_digest, + murmur3_x64_128_digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm new file mode 100644 index 000000000..4611494e0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm @@ -0,0 +1,706 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[%%data + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;; Insert murmur's instructions into this macro. +;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256Byte. +;; So insert 1 murmur block into every 4 SHA1_STEP_16_79. 
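+;
+; For reference, the scalar work that the four stitched macros below scatter
+; between the SHA1 vector instructions is one 16-byte murmur3_x64_128 block
+; step.  A minimal C sketch (reference only; it uses the R1-R4, N1/N2 and
+; C1/C2 constants defined further down in this file, and rotl64() is an
+; assumed helper, not an ISA-L routine):
+;
+;   static inline uint64_t rotl64(uint64_t x, int r)
+;   {
+;           return (x << r) | (x >> (64 - r));
+;   }
+;
+;   /* consumes 16 bytes: mur_data1/mur_data2 in the macros below */
+;   static void murmur3_x64_128_block(const uint64_t in[2],
+;                                     uint64_t *h1, uint64_t *h2)
+;   {
+;           uint64_t k1 = in[0], k2 = in[1];
+;
+;           k1 *= C1;  k1 = rotl64(k1, R1);  k1 *= C2;   /* imul / rol / imul */
+;           *h1 ^= k1;                                   /* xor mur_hash1     */
+;           *h1 = rotl64(*h1, R3) + *h2;                 /* rol / add         */
+;           *h1 = *h1 * 5 + N1;                          /* lea h1*5 + N1     */
+;
+;           k2 *= C2;  k2 = rotl64(k2, R2);  k2 *= C1;
+;           *h2 ^= k2;                                   /* xor mur_hash2     */
+;           *h2 = rotl64(*h2, R4) + *h1;
+;           *h2 = *h2 * 5 + N2;                          /* lea h2*5 + N2     */
+;   }
+;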
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J + +%macro SHA1_STEP_16_79_0 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + mov mur_data1, [mur_in_p] + mov mur_data2, [mur_in_p + 8] + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + imul mur_data1, mur_c1_r + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + imul mur_data2, mur_c2_r + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_1 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + rol mur_data1, R1 + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + rol mur_data2, R2 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + imul mur_data1, mur_c2_r + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + imul mur_data2, mur_c1_r + PROLD %%regB,30, %%regT + add mur_in_p, 16 + vpaddd %%regE,%%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_2 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + xor mur_hash1, mur_data1 + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + rol mur_hash1, R3 + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + add mur_hash1, mur_hash2 + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1] + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_3 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + xor mur_hash2, mur_data2 + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16] + rol mur_hash2, R4 + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW 
- 0) & 15) * 16],%%regF + add mur_hash2, mur_hash1 + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2] + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8d + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + %endmacro + %macro FUNC_RESTORE 0 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10d + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8 + %define PS 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + save_reg rbx, 10*16 + 6*8 + save_reg rbp, 10*16 + 7*8 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbx, [rsp + 10*16 + 6*8] + mov rbp, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg4 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables of murmur3 +%define mur_in_p tmp2 +%define mur_digest_p arg3 +%define mur_hash1 tmp3 +%define mur_hash2 tmp4 +%define mur_data1 tmp5 +%define mur_data2 return +%define 
mur_c1_r tmp6 +%define mur_c2_r arg5 +; constants of murmur3_x64_128 +%define R1 31 +%define R2 33 +%define R3 27 +%define R4 31 +%define M 5 +%define N1 0x52dce729;DWORD +%define N2 0x38495ab5;DWORD +%define C1 QWORD(0x87c37b91114253d5) +%define C2 QWORD(0x4cf5ad432745937f) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp7 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep | + +align 32 +;void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data, +; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], +; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], +; uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. 
+; arg 3 pointer to murmur3 digest +; arg 4 number of 1KB blocks +; +mk_global mh_sha1_murmur3_x64_128_block_avx, function, internal +func(mh_sha1_murmur3_x64_128_block_avx) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by avx + and rsp, ~0x0F + + %assign I 0 ; copy segs_digests into stack + %rep 5 + VMOVPS A, [mh_digests_p + I*64 + 16*0] + VMOVPS B, [mh_digests_p + I*64 + 16*1] + VMOVPS C, [mh_digests_p + I*64 + 16*2] + VMOVPS D, [mh_digests_p + I*64 + 16*3] + + vmovdqa [rsp + I*64 + 16*0], A + vmovdqa [rsp + I*64 + 16*1], B + vmovdqa [rsp + I*64 + 16*2], C + vmovdqa [rsp + I*64 + 16*3], D + %assign I (I+1) + %endrep + + ;init murmur variables + mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1 + ;load murmur hash digests and multiplier + mov mur_hash1, [mur_digest_p] + mov mur_hash2, [mur_digest_p + 8] + mov mur_c1_r, C1 + mov mur_c2_r, C2 + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4 + %assign I 0 + %rep 16 + VMOVPS T0,[mh_in_p + I*64+0*16] + VMOVPS T1,[mh_in_p + I*64+1*16] + VMOVPS T2,[mh_in_p + I*64+2*16] + VMOVPS T3,[mh_in_p + I*64+3*16] + + vpshufb T0, F + vmovdqa [mh_data_p +(I)*16 +0*256],T0 + vpshufb T1, F + vmovdqa [mh_data_p +(I)*16 +1*256],T1 + vpshufb T2, F + vmovdqa [mh_data_p +(I)*16 +2*256],T2 + vpshufb T3, F + vmovdqa [mh_data_p +(I)*16 +3*256],T3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + .segs_loop: + ;; Initialize digests + vmovdqa A, [rsp + 0*64 + mh_segs] + vmovdqa B, [rsp + 1*64 + mh_segs] + vmovdqa C, [rsp + 2*64 + mh_segs] + vmovdqa D, [rsp + 3*64 + mh_segs] + vmovdqa E, [rsp + 4*64 + mh_segs] + + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E +;; +;; perform 0-79 steps +;; + vmovdqa K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 16...19 + vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16] + vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16] + %rep 4 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 20...39 + vmovdqa K, [K20_39] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 40...59 + vmovdqa K, [K40_59] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 60...79 + vmovdqa K, [K60_79] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + vpaddd A, AA + vpaddd B, BB + vpaddd C, CC + vpaddd D, DD + vpaddd E, EE + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], A + vmovdqa [rsp + 1*64 + mh_segs], B + vmovdqa [rsp + 2*64 + mh_segs], C + vmovdqa [rsp + 3*64 + mh_segs], D + vmovdqa [rsp + 4*64 + mh_segs], E + + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + ;store murmur-hash digest + mov [mur_digest_p], mur_hash1 + mov [mur_digest_p + 8], mur_hash2 + + %assign I 0 ; copy segs_digests back 
to mh_digests_p + %rep 5 + vmovdqa A, [rsp + I*64 + 16*0] + vmovdqa B, [rsp + I*64 + 16*1] + vmovdqa C, [rsp + I*64 + 16*2] + vmovdqa D, [rsp + I*64 + 16*3] + + VMOVPS [mh_digests_p + I*64 + 16*0], A + VMOVPS [mh_digests_p + I*64 + 16*1], B + VMOVPS [mh_digests_p + I*64 + 16*2], C + VMOVPS [mh_digests_p + I*64 + 16*3], D + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm new file mode 100644 index 000000000..3fb440bf1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm @@ -0,0 +1,653 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX2 +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + + + +;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-%%imm) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[%%data + (%%memW * 32)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;; Insert murmur's instructions into this macro. +;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 512Byte. +;; So insert 1 murmur block into every 2 SHA1_STEP_16_79. 
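+;
+; Each ymm register below carries eight independent SHA1 segment lanes; per
+; lane, one SHA1_STEP_16_79 performs the FIPS 180-1 message schedule and
+; round update.  A scalar C sketch of a single lane for round t in [16,79]
+; (reference only; f() and k stand for the per-20-round function and
+; constant, and rotl32() is an assumed helper):
+;
+;   uint32_t wt = rotl32(w[(t - 3) & 15] ^ w[(t - 8) & 15] ^
+;                        w[(t - 14) & 15] ^ w[(t - 16) & 15], 1);
+;   w[t & 15] = wt;                               /* stored back to %%data */
+;   uint32_t T = rotl32(a, 5) + f(b, c, d) + e + k + wt;
+;   e = d; d = c; c = rotl32(b, 30); b = a; a = T;
+;
+; The murmur3 work for one 16-byte block is split across the two variants
+; that follow: SHA1_STEP_16_79_0 loads and pre-mixes the two data words,
+; SHA1_STEP_16_79_1 folds them into mur_hash1/mur_hash2 and advances
+; mur_in_p.
+;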
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J + +%macro SHA1_STEP_16_79_0 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] + vpxor W16, W16, W14 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] + mov mur_data1, [mur_in_p] + mov mur_data2, [mur_in_p + 8] + + vpsrld %%regF, W16, (32-1) + imul mur_data1, mur_c1_r + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + imul mur_data2, mur_c2_r + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF + rol mur_data1, R1 + vpaddd %%regE, %%regE,%%regF + rol mur_data2, R2 + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + imul mur_data1, mur_c2_r + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + imul mur_data2, mur_c1_r + vpaddd %%regE,%%regE,%%regF +%endmacro + + +%macro SHA1_STEP_16_79_1 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + vpaddd %%regE, %%regE,%%immCNT + xor mur_hash1, mur_data1 + vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] + rol mur_hash1, R3 + vpxor W16, W16, W14 + add mur_hash1, mur_hash2 + vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] + lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1] + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + xor mur_hash2, mur_data2 + vpor %%regF, %%regF, W16 + rol mur_hash2, R4 + ROTATE_W + + vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF + vpaddd %%regE, %%regE,%%regF + add mur_hash2, mur_hash1 + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2] + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + add mur_in_p, 16 + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8d + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + %endmacro + %macro FUNC_RESTORE 0 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10d + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be 
saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8 + %define PS 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + save_reg rbx, 10*16 + 6*8 + save_reg rbp, 10*16 + 7*8 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbx, [rsp + 10*16 + 6*8] + mov rbp, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg4 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables of murmur3 +%define mur_in_p tmp2 +%define mur_digest_p arg3 +%define mur_hash1 tmp3 +%define mur_hash2 tmp4 +%define mur_data1 tmp5 +%define mur_data2 return +%define mur_c1_r tmp6 +%define mur_c2_r arg5 +; constants of murmur3_x64_128 +%define R1 31 +%define R2 33 +%define R3 27 +%define R4 31 +%define M 5 +%define N1 0x52dce729;DWORD +%define N2 0x38495ab5;DWORD +%define C1 QWORD(0x87c37b91114253d5) +%define C2 QWORD(0x4cf5ad432745937f) +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp7 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS + +%define pref tmp8 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 + +%define F ymm5 +%define T0 ymm6 +%define T1 ymm7 +%define T2 ymm8 +%define T3 ymm9 +%define T4 ymm10 +%define T5 ymm11 +%define T6 ymm12 +%define T7 ymm13 +%define T8 ymm14 +%define T9 ymm15 + +%define AA ymm5 +%define BB ymm6 +%define CC ymm7 +%define DD ymm8 +%define EE ymm9 +%define TMP ymm10 +%define FUN ymm11 +%define K ymm12 +%define W14 ymm13 +%define W15 ymm14 +%define W16 ymm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... 
+; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep | + +align 32 +;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data, +; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], +; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], +; uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 pointer to murmur3 digest +; arg 4 number of 1KB blocks +; +mk_global mh_sha1_murmur3_x64_128_block_avx2, function, internal +func(mh_sha1_murmur3_x64_128_block_avx2) + endbranch + FUNC_SAVE + + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 32 Bytes needed by avx2 + and rsp, ~0x1F + + %assign I 0 ; copy segs_digests into stack + %rep 2 + VMOVPS A, [mh_digests_p + I*32*5 + 32*0] + VMOVPS B, [mh_digests_p + I*32*5 + 32*1] + VMOVPS C, [mh_digests_p + I*32*5 + 32*2] + VMOVPS D, [mh_digests_p + I*32*5 + 32*3] + VMOVPS E, [mh_digests_p + I*32*5 + 32*4] + + vmovdqa [rsp + I*32*5 + 32*0], A + vmovdqa [rsp + I*32*5 + 32*1], B + vmovdqa [rsp + I*32*5 + 32*2], C + vmovdqa [rsp + I*32*5 + 32*3], D + vmovdqa [rsp + I*32*5 + 32*4], E + %assign I (I+1) + %endrep + + ;init murmur variables + mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1 + ;load murmur hash digests and multiplier + mov mur_hash1, [mur_digest_p] + mov mur_hash2, [mur_digest_p + 8] + mov mur_c1_r, C1 + mov mur_c2_r, C2 + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2 +%assign I 0 +%rep 16 + VMOVPS T0,[mh_in_p + I*64+0*32] + VMOVPS T1,[mh_in_p + I*64+1*32] + + vpshufb T0, T0, F + vmovdqa [mh_data_p +I*32+0*512],T0 + vpshufb T1, T1, F + vmovdqa [mh_data_p +I*32+1*512],T1 +%assign I (I+1) +%endrep + + mov mh_segs, 0 ;start from the first 8 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + ;; Initialize digests + vmovdqa A, [rsp + 0*64 + mh_segs] + vmovdqa B, [rsp + 1*64 + mh_segs] + vmovdqa C, [rsp + 2*64 + mh_segs] + vmovdqa D, [rsp + 3*64 + mh_segs] + vmovdqa E, [rsp + 4*64 + mh_segs] + + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E +;; +;; perform 0-79 steps +;; + vmovdqa K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32] + vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32] + %rep 4 + %assign J (I % 2) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + PREFETCH_X [mh_in_p + pref+128*1] +;; do rounds 20...39 + vmovdqa K, [K20_39] + %rep 20 + %assign J (I % 2) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep +;; do rounds 40...59 + vmovdqa K, [K40_59] + %rep 20 + %assign J (I % 2) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*2] + PREFETCH_X [mh_in_p + pref+128*3] +;; do rounds 60...79 + vmovdqa K, [K60_79] + %rep 20 + %assign J (I % 2) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, 
mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + vpaddd A,A, AA + vpaddd B,B, BB + vpaddd C,C, CC + vpaddd D,D, DD + vpaddd E,E, EE + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], A + vmovdqa [rsp + 1*64 + mh_segs], B + vmovdqa [rsp + 2*64 + mh_segs], C + vmovdqa [rsp + 3*64 + mh_segs], D + vmovdqa [rsp + 4*64 + mh_segs], E + + add pref, 512 + + add mh_data_p, 512 + add mh_segs, 32 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + ;store murmur-hash digest + mov [mur_digest_p], mur_hash1 + mov [mur_digest_p + 8], mur_hash2 + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 2 + vmovdqa A, [rsp + I*32*5 + 32*0] + vmovdqa B, [rsp + I*32*5 + 32*1] + vmovdqa C, [rsp + I*32*5 + 32*2] + vmovdqa D, [rsp + I*32*5 + 32*3] + vmovdqa E, [rsp + I*32*5 + 32*4] + + VMOVPS [mh_digests_p + I*32*5 + 32*0], A + VMOVPS [mh_digests_p + I*32*5 + 32*1], B + VMOVPS [mh_digests_p + I*32*5 + 32*2], C + VMOVPS [mh_digests_p + I*32*5 + 32*3], D + VMOVPS [mh_digests_p + I*32*5 + 32*4], E + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=32 + +align 32 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm new file mode 100644 index 000000000..a5c157078 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm @@ -0,0 +1,504 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using AVX-512 +;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovdqu64 +;SIMD variables definition +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define HH0 zmm5 +%define HH1 zmm6 +%define HH2 zmm7 +%define HH3 zmm8 +%define HH4 zmm9 +%define KT zmm10 +%define XTMP0 zmm11 +%define XTMP1 zmm12 +%define SHUF_MASK zmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;using extra 16 ZMM registers to place the inverse input data +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;macros definition +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%F_IMMED %2 + + ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt + ; E=D, D=C, C=ROTL_30(B), B=A, A=T + + ; Ft + ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) + ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D + ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) + + vmovdqa32 XTMP1, B ; Copy B + vpaddd E, E, %%WT ; E = E + Wt + vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) + vpaddd E, E, KT ; E = E + Wt + Kt + vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A) + vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt + vprold B, B, 30 ; B = ROTL_30(B) + vpaddd E, E, XTMP0 ; E = T + + ROTATE_ARGS +%endmacro + +;; Insert murmur's instructions into this macro. +;; Every section_loop of mh_sha1 calls PROCESS_LOOP 80 and +;; MSG_SCHED_ROUND_16_79 64 times and processes 1024 Bytes. +;; So insert 1 murmur block per section_loop. 
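+;
+; Two notes on the stitched loop that follows (reference only):
+;
+; 1) The vpternlogd immediates select the SHA1 round function directly from
+;    the truth table over the bits of (B, C, D):
+;       Ch(B,C,D)     = (B & C) ^ (~B & D)            -> imm8 0xCA (rounds  0-19)
+;       Parity(B,C,D) = B ^ C ^ D                     -> imm8 0x96 (rounds 20-39, 60-79)
+;       Maj(B,C,D)    = (B & C) ^ (B & D) ^ (C & D)   -> imm8 0xE8 (rounds 40-59)
+;    A C sketch of how such an immediate is derived (illustrative helper,
+;    not an ISA-L routine):
+;
+;       static uint8_t ternlog_imm(uint32_t (*f)(uint32_t, uint32_t, uint32_t))
+;       {
+;               uint8_t imm = 0;
+;               for (int i = 0; i < 8; i++)      /* i = (B << 2) | (C << 1) | D */
+;                       imm |= (f((i >> 2) & 1, (i >> 1) & 1, i & 1) & 1) << i;
+;               return imm;                      /* 0xCA, 0x96 or 0xE8 above    */
+;       }
+;
+; 2) Each PROCESS_LOOP_MUR below consumes one 16-byte murmur3 block and
+;    advances mur_in_p by 16, so the 64 stitched rounds together cover the
+;    whole 1024-byte block handled per .block_loop iteration.
+;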
+%macro PROCESS_LOOP_MUR 2 +%define %%WT %1 +%define %%F_IMMED %2 + + ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt + ; E=D, D=C, C=ROTL_30(B), B=A, A=T + + ; Ft + ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) + ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D + ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) + + mov mur_data1, [mur_in_p] + mov mur_data2, [mur_in_p + 8] + vmovdqa32 XTMP1, B ; Copy B + imul mur_data1, mur_c1_r + imul mur_data2, mur_c2_r + vpaddd E, E, %%WT ; E = E + Wt + rol mur_data1, R1 + rol mur_data2, R2 + vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) + imul mur_data1, mur_c2_r + imul mur_data2, mur_c1_r + vpaddd E, E, KT ; E = E + Wt + Kt + xor mur_hash1, mur_data1 + add mur_in_p, 16 + vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A) + rol mur_hash1, R3 + vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt + add mur_hash1, mur_hash2 + vprold B, B, 30 ; B = ROTL_30(B) + lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1] + vpaddd E, E, XTMP0 ; E = T + xor mur_hash2, mur_data2 + + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79_MUR 4 +%define %%WT %1 +%define %%WTp2 %2 +%define %%WTp8 %3 +%define %%WTp13 %4 + ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16) + ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt) + vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 + rol mur_hash2, R4 + vpxord %%WT, %%WT, %%WTp13 + add mur_hash2, mur_hash1 + lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2] + vprold %%WT, %%WT, 1 +%endmacro + +%define APPEND(a,b) a %+ b +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8d + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + %endmacro + %macro FUNC_RESTORE 0 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10d + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8 + %define PS 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + ; remove unwind info macros + %define func(x) x: + %macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp + 0*16], xmm6 + movdqa [rsp + 1*16], xmm7 + movdqa [rsp + 2*16], xmm8 + movdqa [rsp + 3*16], xmm9 + movdqa [rsp + 4*16], xmm10 + movdqa [rsp + 5*16], xmm11 + movdqa [rsp + 6*16], xmm12 + movdqa [rsp + 7*16], xmm13 + movdqa [rsp + 8*16], xmm14 + movdqa [rsp + 9*16], xmm15 + mov [rsp + 10*16 + 0*8], r12 + mov [rsp + 10*16 + 1*8], r13 + mov [rsp + 10*16 + 2*8], r14 + mov [rsp + 10*16 + 3*8], r15 + mov 
[rsp + 10*16 + 4*8], rdi + mov [rsp + 10*16 + 5*8], rsi + mov [rsp + 10*16 + 6*8], rbx + mov [rsp + 10*16 + 7*8], rbp + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbx, [rsp + 10*16 + 6*8] + mov rbp, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg4 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables of murmur3 +%define mur_in_p tmp2 +%define mur_digest_p arg3 +%define mur_hash1 tmp3 +%define mur_hash2 tmp4 +%define mur_data1 tmp5 +%define mur_data2 return +%define mur_c1_r tmp6 +%define mur_c2_r arg5 +; constants of murmur3_x64_128 +%define R1 31 +%define R2 33 +%define R3 27 +%define R4 31 +%define M 5 +%define N1 0x52dce729;DWORD +%define N2 0x38495ab5;DWORD +%define C1 QWORD(0x87c37b91114253d5) +%define C2 QWORD(0x4cf5ad432745937f) +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp7 + +%define pref tmp8 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep | + +[bits 64] +section .text +align 32 + +;void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data, +; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], +; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], +; uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 pointer to murmur3 digest +; arg 4 number of 1KB blocks +; +global mh_sha1_murmur3_x64_128_block_avx512 +func(mh_sha1_murmur3_x64_128_block_avx512) + endbranch + FUNC_SAVE + + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; align rsp to 64 Bytes needed by avx512 + and rsp, ~0x3f + + ; copy segs_digests into registers. 
+ VMOVPS HH0, [mh_digests_p + 64*0] + VMOVPS HH1, [mh_digests_p + 64*1] + VMOVPS HH2, [mh_digests_p + 64*2] + VMOVPS HH3, [mh_digests_p + 64*3] + VMOVPS HH4, [mh_digests_p + 64*4] + ;a mask used to transform to big-endian data + vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK] + + ;init murmur variables + mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1 + ;load murmur hash digests and multiplier + mov mur_hash1, [mur_digest_p] + mov mur_hash2, [mur_digest_p + 8] + mov mur_c1_r, C1 + mov mur_c2_r, C2 + +.block_loop: + ;transform to big-endian data and store on aligned_frame + ;using extra 16 ZMM registers instead of stack +%assign I 0 +%rep 8 +%assign J (I+1) + VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64] + VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64] + + vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK + vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK +%assign I (I+2) +%endrep + + vmovdqa64 A, HH0 + vmovdqa64 B, HH1 + vmovdqa64 C, HH2 + vmovdqa64 D, HH3 + vmovdqa64 E, HH4 + + vmovdqa32 KT, [K00_19] +%assign I 0xCA +%assign J 0 +%assign K 2 +%assign L 8 +%assign M 13 +%assign N 0 +%rep 80 + %if N < 64 ; stitching 64 times + PROCESS_LOOP_MUR APPEND(W,J), I + MSG_SCHED_ROUND_16_79_MUR APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %else ; 64 <= N < 80, without stitching + PROCESS_LOOP APPEND(W,J), I + %endif + %if N = 19 + vmovdqa32 KT, [K20_39] + %assign I 0x96 + %elif N = 39 + vmovdqa32 KT, [K40_59] + %assign I 0xE8 + %elif N = 59 + vmovdqa32 KT, [K60_79] + %assign I 0x96 + %endif + %if N % 20 = 19 + PREFETCH_X [mh_in_p + 1024+128*(N / 20)] + PREFETCH_X [mh_in_p + 1024+128*(N / 20 +1)] + %endif +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%assign N (N+1) +%endrep + + ; Add old digest + vpaddd HH0,A, HH0 + vpaddd HH1,B, HH1 + vpaddd HH2,C, HH2 + vpaddd HH3,D, HH3 + vpaddd HH4,E, HH4 + + add mh_in_p, 1024 + sub loops, 1 + jne .block_loop + + ;store murmur-hash digest + mov [mur_digest_p], mur_hash1 + mov [mur_digest_p + 8], mur_hash2 + + ; copy segs_digests to mh_digests_p + VMOVPS [mh_digests_p + 64*0], HH0 + VMOVPS [mh_digests_p + 64*1], HH1 + VMOVPS [mh_digests_p + 64*2], HH2 + VMOVPS [mh_digests_p + 64*3], HH3 + VMOVPS [mh_digests_p + 64*4], HH4 + + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + + +section .data align=64 + +align 64 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + dq 0x5A8279995A827999 + +K20_39: dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1 + +K40_59: dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC + +K60_79: dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_murmur3_x64_128_block_avx512 +no_sha1_murmur3_x64_128_block_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git 
a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm new file mode 100644 index 000000000..ebd1b8b49 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm @@ -0,0 +1,702 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA1 using SSE +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + paddd %%regE,[%%data + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro SHA1_STEP_16_79 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;; Insert murmur's instructions into this macro. +;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256Byte. +;; So insert 1 murmur block into every 4 SHA1_STEP_16_79. 
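+;
+; For context, these block routines are reached through the generic
+; mh_sha1_murmur3_x64_128_update()/_finalize() entry points whose prototypes
+; appear in mh_sha1_murmur3_x64_128_base_aliases.c earlier in this patch.
+; A minimal C usage sketch (the init signature taking a murmur seed is an
+; assumption here; digest sizes follow SHA1_DIGEST_WORDS and
+; MURMUR3_x64_128_DIGEST_WORDS):
+;
+;   struct mh_sha1_murmur3_x64_128_ctx ctx;
+;   uint32_t sha1_digest[5];          /* SHA1_DIGEST_WORDS            */
+;   uint32_t murmur_digest[4];        /* MURMUR3_x64_128_DIGEST_WORDS */
+;
+;   mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed);          /* assumed form */
+;   mh_sha1_murmur3_x64_128_update(&ctx, buffer, buffer_len);
+;   mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+;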
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J + +%macro SHA1_STEP_16_79_0 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + + paddd %%regE,%%immCNT + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + mov mur_data1, [mur_in_p] + mov mur_data2, [mur_in_p + 8] + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + imul mur_data1, mur_c1_r + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + imul mur_data2, mur_c2_r + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_1 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + rol mur_data1, R1 + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + rol mur_data2, R2 + psrld %%regF, (32-1) + por %%regF, W16 + + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + imul mur_data1, mur_c2_r + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + imul mur_data2, mur_c1_r + PROLD %%regB,30, %%regT + add mur_in_p, 16 + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_2 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + xor mur_hash1, mur_data1 + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + rol mur_hash1, R3 + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + add mur_hash1, mur_hash2 + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1] + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79_3 11 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 +%define %%data %11 + paddd %%regE,%%immCNT + movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] + xor mur_hash2, mur_data2 + pxor W16, W14 + pxor W16, [%%data + ((%%memW - 8) & 15) * 16] + pxor W16, [%%data + ((%%memW - 3) & 15) * 16] + rol mur_hash2, R4 + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + + ROTATE_W + + movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF + add mur_hash2, mur_hash1 + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC 
%%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2] + paddd %%regE,%%regF +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8d + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + %endmacro + %macro FUNC_RESTORE 0 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10d + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define tmp7 rbx ; must be saved and restored + %define tmp8 rbp ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8 + %define PS 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + save_reg rbx, 10*16 + 6*8 + save_reg rbp, 10*16 + 7*8 + end_prolog + mov arg4, arg(4) + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + mov rbx, [rsp + 10*16 + 6*8] + mov rbp, [rsp + 10*16 + 7*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg4 +;variables of mh_sha1 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables of murmur3 +%define mur_in_p tmp2 +%define mur_digest_p arg3 +%define mur_hash1 tmp3 +%define mur_hash2 tmp4 +%define mur_data1 tmp5 +%define mur_data2 return +%define mur_c1_r tmp6 +%define mur_c2_r arg5 +; constants of murmur3_x64_128 +%define R1 31 +%define R2 33 +%define R3 27 +%define R4 31 +%define M 5 +%define N1 0x52dce729;DWORD 
+%define N2 0x38495ab5;DWORD +%define C1 QWORD(0x87c37b91114253d5) +%define C2 QWORD(0x4cf5ad432745937f) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp7 +%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define MOVPS movups + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep | + +align 32 +;void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data, +; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], +; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], +; uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 pointer to murmur3 digest +; arg 4 number of 1KB blocks +; +mk_global mh_sha1_murmur3_x64_128_block_sse, function, internal +func(mh_sha1_murmur3_x64_128_block_sse) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by sse + and rsp, ~0x0F + + %assign I 0 ; copy segs_digests into stack + %rep 5 + MOVPS A, [mh_digests_p + I*64 + 16*0] + MOVPS B, [mh_digests_p + I*64 + 16*1] + MOVPS C, [mh_digests_p + I*64 + 16*2] + MOVPS D, [mh_digests_p + I*64 + 16*3] + + movdqa [rsp + I*64 + 16*0], A + movdqa [rsp + I*64 + 16*1], B + movdqa [rsp + I*64 + 16*2], C + movdqa [rsp + I*64 + 16*3], D + %assign I (I+1) + %endrep + + ;init murmur variables + mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1 + ;load murmur hash digests and multiplier + mov mur_hash1, [mur_digest_p] + mov mur_hash2, [mur_digest_p + 8] + mov mur_c1_r, C1 + mov mur_c2_r, C2 + +.block_loop: + ;transform to big-endian data and store on aligned_frame + movdqa F, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4 + %assign I 0 + %rep 16 + MOVPS T0,[mh_in_p+I*64+0*16] + MOVPS T1,[mh_in_p+I*64+1*16] + MOVPS T2,[mh_in_p+I*64+2*16] + MOVPS T3,[mh_in_p+I*64+3*16] + + pshufb T0, F + movdqa [mh_data_p+(I)*16 +0*256],T0 + pshufb T1, F + movdqa [mh_data_p+(I)*16 +1*256],T1 + pshufb T2, F + movdqa [mh_data_p+(I)*16 +2*256],T2 + pshufb T3, F + movdqa [mh_data_p+(I)*16 +3*256],T3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + .segs_loop: + ;; Initialize digests + movdqa A, [rsp + 0*64 + mh_segs] + movdqa B, [rsp + 1*64 + mh_segs] + movdqa C, [rsp + 2*64 + mh_segs] + movdqa D, [rsp 
+ 3*64 + mh_segs] + movdqa E, [rsp + 4*64 + mh_segs] + + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E +;; +;; perform 0-79 steps +;; + movdqa K, [K00_19] +;; do rounds 0...15 + %assign I 0 + %rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 16...19 + movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16] + movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16] + %rep 4 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 20...39 + movdqa K, [K20_39] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 40...59 + movdqa K, [K40_59] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + +;; do rounds 60...79 + movdqa K, [K60_79] + %rep 20 + %assign J (I % 4) + SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p + ROTATE_ARGS + %assign I (I+1) + %endrep + + paddd A, AA + paddd B, BB + paddd C, CC + paddd D, DD + paddd E, EE + + ; write out digests + movdqa [rsp + 0*64 + mh_segs], A + movdqa [rsp + 1*64 + mh_segs], B + movdqa [rsp + 2*64 + mh_segs], C + movdqa [rsp + 3*64 + mh_segs], D + movdqa [rsp + 4*64 + mh_segs], E + + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + ;store murmur-hash digest + mov [mur_digest_p], mur_hash1 + mov [mur_digest_p + 8], mur_hash2 + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 5 + movdqa A, [rsp + I*64 + 16*0] + movdqa B, [rsp + I*64 + 16*1] + movdqa C, [rsp + I*64 + 16*2] + movdqa D, [rsp + I*64 + 16*3] + + MOVPS [mh_digests_p + I*64 + 16*0], A + MOVPS [mh_digests_p + I*64 + 16*1], B + MOVPS [mh_digests_p + I*64 + 16*2], C + MOVPS [mh_digests_p + I*64 + 16*3], D + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c new file mode 100644 index 000000000..4d09abf1d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c @@ -0,0 +1,102 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef FINALIZE_FUNCTION
+#include <stdlib.h>		// For NULL
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#define FINALIZE_FUNCTION	mh_sha1_murmur3_x64_128_finalize_base
+#define MH_SHA1_TAIL_FUNCTION	mh_sha1_tail_base
+#define FINALIZE_FUNCTION_SLVER
+#endif
+
+#define MURMUR_BLOCK_FUNCTION	murmur3_x64_128_block
+#define MURMUR_TAIL_FUNCTION	murmur3_x64_128_tail
+
+int FINALIZE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, void *mh_sha1_digest,
+		      void *murmur3_x64_128_digest)
+{
+	uint8_t *partial_block_buffer, *murmur_tail_data;
+	uint64_t partial_block_len, total_len;
+	uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+	uint8_t *aligned_frame_buffer;
+
+	if (ctx == NULL)
+		return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+	total_len = ctx->total_length;
+	partial_block_len = total_len % MH_SHA1_BLOCK_SIZE;
+	partial_block_buffer = ctx->partial_block_buffer;
+
+	// Calculate murmur3 first,
+	// because mh_sha1 will modify the partial_block_buffer
+	// ( partial_block_buffer = n murmur3 blocks and 1 murmur3 tail)
+	murmur_tail_data =
+	    partial_block_buffer + partial_block_len - partial_block_len % MUR_BLOCK_SIZE;
+	MURMUR_BLOCK_FUNCTION(partial_block_buffer, partial_block_len / MUR_BLOCK_SIZE,
+			      ctx->murmur3_x64_128_digest);
+	MURMUR_TAIL_FUNCTION(murmur_tail_data, total_len, ctx->murmur3_x64_128_digest);
+
+	/* mh_sha1 final */
+	aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+	mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+	MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+			      aligned_frame_buffer, ctx->mh_sha1_digest);
+
+	/* Output the digests of murmur3 and mh_sha1 */
+	if (mh_sha1_digest != NULL) {
+		((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+		((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+		((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+		((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+		((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+	}
+
+	if (murmur3_x64_128_digest != NULL) {
+		((uint32_t *) murmur3_x64_128_digest)[0] = ctx->murmur3_x64_128_digest[0];
+		((uint32_t *) murmur3_x64_128_digest)[1] = ctx->murmur3_x64_128_digest[1];
+		((uint32_t *) murmur3_x64_128_digest)[2] = ctx->murmur3_x64_128_digest[2];
+		((uint32_t *) murmur3_x64_128_digest)[3] = ctx->murmur3_x64_128_digest[3];
+	}
+
+	return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
+
+#ifdef FINALIZE_FUNCTION_SLVER
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
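As a usage reference for the finalize routine above: callers normally drive the stitched hash through the public init/update/finalize entry points from mh_sha1_murmur3_x64_128.h. The wrapper below is a hypothetical sketch, not library code; it reuses the library's CTX error codes for brevity.

#include <stdint.h>
#include <stdlib.h>
#include "mh_sha1_murmur3_x64_128.h"

/* Hypothetical one-shot wrapper: seed murmur3, feed one buffer, collect both digests. */
static int hash_buffer(const void *buf, uint32_t len, uint64_t seed,
		       uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS],
		       uint32_t murmur3_digest[MURMUR3_x64_128_DIGEST_WORDS])
{
	struct mh_sha1_murmur3_x64_128_ctx *ctx = malloc(sizeof(*ctx));
	int rc;

	if (ctx == NULL)
		return MH_SHA1_MURMUR3_CTX_ERROR_NULL;	/* reuse the NULL error code for allocation failure */

	rc = mh_sha1_murmur3_x64_128_init(ctx, seed);
	if (rc == MH_SHA1_MURMUR3_CTX_ERROR_NONE)
		rc = mh_sha1_murmur3_x64_128_update(ctx, buf, len);
	if (rc == MH_SHA1_MURMUR3_CTX_ERROR_NONE)
		rc = mh_sha1_murmur3_x64_128_finalize(ctx, mh_sha1_digest, murmur3_digest);

	free(ctx);
	return rc;
}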
+	// Version info
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver_0000025b;
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver = { 0x025b, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
new file mode 100644
index 000000000..e77837347
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
@@ -0,0 +1,202 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+
+/**
+ *  @file mh_sha1_murmur3_x64_128_internal.h
+ *  @brief mh_sha1_murmur3_x64_128 internal function prototypes and macros
+ *
+ *  Interface for mh_sha1_murmur3_x64_128 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_internal.h"
+#include "mh_sha1_murmur3_x64_128.h"
+
+#ifdef __cplusplus
+	extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+	/*******************************************************************
+	 * mh_sha1_murmur3_x64_128 API internal function prototypes
+	 * Multiple versions of Update and Finalize functions are supplied which use
+	 * multiple versions of block and tail process subfunctions.
+	 ******************************************************************/
+
+	/**
+	 * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N
+	 *
+	 * This function determines what instruction sets are enabled and selects the
+	 * appropriate version at runtime.
+	 *
+	 * @param  input_data Pointer to input data to be processed
+	 * @param  mh_sha1_digests 16 segments digests
+	 * @param  frame_buffer Pointer to buffer which is a temp working area
+	 * @param  murmur3_x64_128_digests Murmur3 digest
+	 * @param  num_blocks The number of blocks.
+ * @returns none + * + */ + // Each function needs an individual C or ASM file because they impact performance much. + //They will be called by mh_sha1_murmur3_x64_128_update_XXX. + void mh_sha1_murmur3_x64_128_block (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_murmur3_x64_128_block_base (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires SSE + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires AVX + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires AVX2 + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + + /** + * @brief Calculate blocks which size is MH_SHA1_BLOCK_SIZE*N + * + * @requires AVX512 + * + * @param input_data Pointer to input data to be processed + * @param mh_sha1_digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param murmur3_x64_128_digests Murmur3 digest + * @param num_blocks The number of blocks. 
+ * @returns none + * + */ + void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data, + uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], + uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t num_blocks); + /******************************************************************* + * murmur hash API + ******************************************************************/ + + /** + * @brief Calculate murmur digest of blocks which size is 16*N. + * @param input_data Pointer to input data to be processed + * @param num_blocks The number of blocks which size is 16. + * @param murmur3_x64_128_digests Murmur3 digest + * @returns none + * + */ + void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks, + uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]); + + /** + * @brief Do the tail process which is less than 16Byte. + * @param tail_buffer Pointer to input data to be processed + * @param total_len The total length of the input_data + * @param digests Murmur3 digest + * @returns none + * + */ + void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len, + uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm new file mode 100644 index 000000000..6f9e54cdd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm @@ -0,0 +1,76 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
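The block routines prototyped above exist in one variant per instruction set, and the multibinary layer that follows binds the exported update/finalize names to the best available variant at run time. The C below is only a conceptual model of that selection, shown here for the block variants; the resolve_block_fn helper is introduced for illustration, __builtin_cpu_supports is the GCC/Clang builtin standing in for the dispatcher's feature checks, and the shipped dispatcher actually does this once, in assembly, through the mbin_dispatch_init* macros.

#include <stdint.h>
#include "mh_sha1_murmur3_x64_128_internal.h"

typedef void (*mh_block_fn)(const uint8_t *input_data,
			    uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
			    uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
			    uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
			    uint32_t num_blocks);

/* Conceptual selection only; the AVX512 variant is omitted here because it is
 * only built when HAVE_AS_KNOWS_AVX512 is defined. */
static mh_block_fn resolve_block_fn(void)
{
#if defined(__GNUC__) || defined(__clang__)
	if (__builtin_cpu_supports("avx2"))
		return mh_sha1_murmur3_x64_128_block_avx2;
	if (__builtin_cpu_supports("avx"))
		return mh_sha1_murmur3_x64_128_block_avx;
	if (__builtin_cpu_supports("ssse3"))	/* the SSE path relies on pshufb */
		return mh_sha1_murmur3_x64_128_block_sse;
#endif
	return mh_sha1_murmur3_x64_128_block_base;	/* portable C fallback */
}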
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" + +%ifidn __OUTPUT_FORMAT__, elf32 + [bits 32] +%else + default rel + [bits 64] + + extern mh_sha1_murmur3_x64_128_update_sse + extern mh_sha1_murmur3_x64_128_update_avx + extern mh_sha1_murmur3_x64_128_update_avx2 + extern mh_sha1_murmur3_x64_128_finalize_sse + extern mh_sha1_murmur3_x64_128_finalize_avx + extern mh_sha1_murmur3_x64_128_finalize_avx2 + + %ifdef HAVE_AS_KNOWS_AVX512 + extern mh_sha1_murmur3_x64_128_update_avx512 + extern mh_sha1_murmur3_x64_128_finalize_avx512 + %endif + +%endif + +extern mh_sha1_murmur3_x64_128_update_base +extern mh_sha1_murmur3_x64_128_finalize_base + +mbin_interface mh_sha1_murmur3_x64_128_update +mbin_interface mh_sha1_murmur3_x64_128_finalize + +%ifidn __OUTPUT_FORMAT__, elf64 + + %ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2, mh_sha1_murmur3_x64_128_update_avx512 + mbin_dispatch_init6 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2, mh_sha1_murmur3_x64_128_finalize_avx512 + %else + mbin_dispatch_init5 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2 + mbin_dispatch_init5 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2 + %endif + +%else + mbin_dispatch_init2 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base + mbin_dispatch_init2 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base +%endif + +;;; func core, ver, snum +slversion mh_sha1_murmur3_x64_128_update, 00, 02, 0252 +slversion mh_sha1_murmur3_x64_128_finalize, 00, 02, 0253 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c new file mode 100644 index 000000000..77ebb964e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c @@ -0,0 +1,206 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "mh_sha1_murmur3_x64_128.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Loop many times over same +# define TEST_LEN 16*1024 +# define TEST_LOOPS 20000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define TEST_LEN 32*1024*1024 +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#define TEST_MEM TEST_LEN + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \ + printf("The stitch function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest); + +extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * murmur3_x64_128_digest); + +void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest) +{ + mh_sha1_ref(buffer, len, mh_sha1_digest); + murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest); + + return; +} + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS], + uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + int murmur3_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_base[i]) + mh_sha1_fail++; + } + + for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) { + if (murmur3_test[i] != murmur3_base[i]) + murmur3_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("base: "); + dump((char *)hash_base, 20); + printf("ref: "); + dump((char *)hash_test, 20); + } + if (murmur3_fail) { + printf("murmur3 fail test\n"); + printf("base: "); + dump((char *)murmur3_base, 16); + printf("ref: "); + dump((char *)murmur3_test, 16); 
+ } + + return mh_sha1_fail + murmur3_fail; +} + +int main(int argc, char *argv[]) +{ + int i, fail = 0; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS]; + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS], + murmur3_base[MURMUR3_x64_128_DIGEST_WORDS]; + uint8_t *buff = NULL; + struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL; + struct perf start, stop; + + printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n"); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + // mh_sha1_murmur3 base version + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base); + perf_start(&start); + for (i = 0; i < TEST_LOOPS / 10; i++) { + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, + murmur3_base); + } + perf_stop(&stop); + printf("mh_sha1_murmur3_x64_128_base" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + //Update feature test + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + } + perf_stop(&stop); + printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + // Check results + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size=%d\n", TEST_LEN); + return -1; + } + + if (fail) + printf("Test failed function test%d\n", fail); + else + printf("Pass func check\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c new file mode 100644 index 000000000..22ab6d1f9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c @@ -0,0 +1,248 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "mh_sha1_murmur3_x64_128.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \ + printf("The stitch function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest); + +extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * murmur3_x64_128_digest); + +void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest) +{ + mh_sha1_ref(buffer, len, mh_sha1_digest); + murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest); + + return; +} + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS], + uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + int murmur3_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_base[i]) + mh_sha1_fail++; + } + + for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) { + if (murmur3_test[i] != murmur3_base[i]) + murmur3_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("base: "); + dump((char *)hash_base, 20); + printf("ref: "); + dump((char *)hash_test, 20); + } + if (murmur3_fail) { + printf("murmur3 fail test\n"); + printf("base: "); + dump((char *)murmur3_base, 16); + printf("ref: "); + dump((char *)murmur3_test, 16); + } + + return mh_sha1_fail + murmur3_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS]; + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS], + murmur3_base[MURMUR3_x64_128_DIGEST_WORDS]; + uint8_t *buff = NULL; + int size, offset; + struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL; + + printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + 
if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base); + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages + for (size = TEST_LEN; size >= 0; size--) { + + // Fill with rand data + rand_buffer(buff, size); + + mh_sha1_murmur3_x64_128_base(buff, size, TEST_SEED, hash_base, murmur3_base); + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various buffer offsets and sizes + printf("offset tests"); + for (size = TEST_LEN - 256; size > 256; size -= 11) { + for (offset = 0; offset < 256; offset++) { + mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED, + hash_base, murmur3_base); + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = + compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size=%d offset=%d\n", size, offset); + return -1; + } + + } + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + // Run efence tests + printf("efence tests"); + for (size = TEST_SIZE; size > 0; size--) { + offset = TEST_LEN - size; + mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED, + hash_base, murmur3_base); + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size=%d offset=%d\n", size, offset); + return -1; + } + + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c new file mode 100644 index 000000000..0e7a3970d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c @@ -0,0 +1,107 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef UPDATE_FUNCTION +#include "mh_sha1_murmur3_x64_128_internal.h" +#include + +#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_base +#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_base +#define UPDATE_FUNCTION_SLVER +#endif + +int UPDATE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer, uint32_t len) +{ + + uint8_t *partial_block_buffer; + uint64_t partial_block_len; + uint64_t num_blocks; + uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + uint32_t *murmur3_x64_128_digest; + const uint8_t *input_data = (const uint8_t *)buffer; + + if (ctx == NULL) + return MH_SHA1_MURMUR3_CTX_ERROR_NULL; + + if (len == 0) + return MH_SHA1_MURMUR3_CTX_ERROR_NONE; + + partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE; + partial_block_buffer = ctx->partial_block_buffer; + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; + murmur3_x64_128_digest = ctx->murmur3_x64_128_digest; + + ctx->total_length += len; + // No enough input data for mh_sha1 calculation + if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) { + memcpy(partial_block_buffer + partial_block_len, input_data, len); + return MH_SHA1_MURMUR3_CTX_ERROR_NONE; + } + // mh_sha1 calculation for the previous partial block + if (partial_block_len != 0) { + memcpy(partial_block_buffer + partial_block_len, input_data, + MH_SHA1_BLOCK_SIZE - partial_block_len); + //do one_block process + BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests, + aligned_frame_buffer, murmur3_x64_128_digest, 1); + input_data += MH_SHA1_BLOCK_SIZE - partial_block_len; + len -= MH_SHA1_BLOCK_SIZE - partial_block_len; + memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE); + } + // Calculate mh_sha1 for the current blocks + num_blocks = len / MH_SHA1_BLOCK_SIZE; + if (num_blocks > 0) { + //do num_blocks process + BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer, + murmur3_x64_128_digest, num_blocks); + len -= num_blocks * MH_SHA1_BLOCK_SIZE; + input_data += num_blocks * MH_SHA1_BLOCK_SIZE; + } + // Store the partial block + if (len != 0) { + memcpy(partial_block_buffer, input_data, len); + } + + return MH_SHA1_MURMUR3_CTX_ERROR_NONE; + +} + +#ifdef 
UPDATE_FUNCTION_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + + // Version info +struct slver mh_sha1_murmur3_x64_128_update_base_slver_0000025a; +struct slver mh_sha1_murmur3_x64_128_update_base_slver = { 0x025a, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c new file mode 100644 index 000000000..6ae888e21 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c @@ -0,0 +1,272 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
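The update test that follows exercises the central contract of the streaming interface: splitting the input across several update calls must yield exactly the same pair of digests as a single call over the whole buffer. The helper below is an illustrative sketch of that check (the split_vs_oneshot name is introduced here, split is assumed to be at most len, and the CHECK_RETURN-style error handling used by the real tests is omitted for brevity).

#include <stdint.h>
#include <string.h>
#include "mh_sha1_murmur3_x64_128.h"

/* Hash buf once in one shot and once as two chunks split at 'split';
 * return 0 if both digest pairs match. */
static int split_vs_oneshot(const uint8_t *buf, uint32_t len, uint32_t split, uint64_t seed)
{
	uint32_t sha_a[SHA1_DIGEST_WORDS], sha_b[SHA1_DIGEST_WORDS];
	uint32_t mur_a[MURMUR3_x64_128_DIGEST_WORDS], mur_b[MURMUR3_x64_128_DIGEST_WORDS];
	struct mh_sha1_murmur3_x64_128_ctx ctx;

	mh_sha1_murmur3_x64_128_init(&ctx, seed);
	mh_sha1_murmur3_x64_128_update(&ctx, buf, len);
	mh_sha1_murmur3_x64_128_finalize(&ctx, sha_a, mur_a);

	mh_sha1_murmur3_x64_128_init(&ctx, seed);
	mh_sha1_murmur3_x64_128_update(&ctx, buf, split);
	mh_sha1_murmur3_x64_128_update(&ctx, buf + split, len - split);
	mh_sha1_murmur3_x64_128_finalize(&ctx, sha_b, mur_b);

	return memcmp(sha_a, sha_b, sizeof(sha_a)) || memcmp(mur_a, mur_b, sizeof(mur_a));
}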
+**********************************************************************/ + +#include +#include +#include "mh_sha1_murmur3_x64_128.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA1_FUNC_TYPE +#define MH_SHA1_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \ + printf("The stitch function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest); + +extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * murmur3_x64_128_digest); + +void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest) +{ + mh_sha1_ref(buffer, len, mh_sha1_digest); + murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest); + + return; +} + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS], + uint32_t hash_test[SHA1_DIGEST_WORDS], + uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS], + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS]) +{ + int i; + int mh_sha1_fail = 0; + int murmur3_fail = 0; + + for (i = 0; i < SHA1_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_base[i]) + mh_sha1_fail++; + } + + for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) { + if (murmur3_test[i] != murmur3_base[i]) + murmur3_fail++; + } + + if (mh_sha1_fail) { + printf("mh_sha1 fail test\n"); + printf("base: "); + dump((char *)hash_base, 20); + printf("ref: "); + dump((char *)hash_test, 20); + } + if (murmur3_fail) { + printf("murmur3 fail test\n"); + printf("base: "); + dump((char *)murmur3_base, 16); + printf("ref: "); + dump((char *)murmur3_test, 16); + } + + return mh_sha1_fail + murmur3_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0, i; + uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS]; + uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS], + murmur3_base[MURMUR3_x64_128_DIGEST_WORDS]; + uint8_t *buff = NULL; + int update_count; + int size1, size2, offset, addr_offset; + struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL; + uint8_t *mem_addr = NULL; + + printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base); + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = 
compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages by update twice. + printf("\n various size messages by update twice tests"); + for (size1 = TEST_LEN; size1 >= 0; size1--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, + murmur3_base); + + // subsequent update + size2 = TEST_LEN - size1; // size2 is different with the former + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various update count + printf("\n various update count tests"); + for (update_count = 1; update_count <= TEST_LEN; update_count++) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, + murmur3_base); + + // subsequent update + size1 = TEST_LEN / update_count; + size2 = TEST_LEN - size1 * (update_count - 1); // size2 is different with the former + + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + for (i = 1, offset = 0; i < update_count; i++) { + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1)); + offset += size1; + } + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // test various start address of ctx. + printf("\n various start address of ctx test"); + free(update_ctx); + mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10); + for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, + murmur3_base); + + // a unaligned offset + update_ctx = (struct mh_sha1_murmur3_x64_128_ctx *)(mem_addr + addr_offset); + CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test)); + + fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test); + + if (fail) { + printf("Fail addr_offset=%d\n", addr_offset); + return -1; + } + + if ((addr_offset & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? 
"Pass" : "Fail"); + + return fail; + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c new file mode 100644 index 000000000..f5fe30a83 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c @@ -0,0 +1,85 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include // for NULL +#include "murmur3_x64_128_internal.c" + +#if (__GNUC__ >= 11) +# define OPT_FIX2 __attribute__ ((optimize(1))) +#else +# define OPT_FIX2 +#endif + +/******************************************************************* + * Single API which can calculate murmur3 + ******************************************************************/ +/** + * @brief Get the digest of murmur3_x64_128 through a single API. + * + * Using murmur3_x64_128_block and murmur3_x64_128_tail. + * Used to test the murmur3_x64_128 digest. 
+ * + * @param buffer Pointer to buffer to be processed + * @param len Length of buffer (in bytes) to be processed + * @param murmur_seed Seed as an initial digest of murmur3 + * @param murmur3_x64_128_digest The digest of murmur3_x64_128 + * @returns none + * + */ +void OPT_FIX2 murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed, + uint32_t * murmur3_x64_128_digest) +{ + uint64_t *murmur3_x64_128_hash; + uint32_t murmur3_x64_128_hash_dword[4]; + uint8_t *tail_buffer; + const uint8_t *input_data = (const uint8_t *)buffer; + + // Initiate murmur3 + murmur3_x64_128_hash = (uint64_t *) murmur3_x64_128_hash_dword; + murmur3_x64_128_hash[0] = murmur_seed; + murmur3_x64_128_hash[1] = murmur_seed; + + // process bodies + murmur3_x64_128_block((uint8_t *) input_data, len / MUR_BLOCK_SIZE, + murmur3_x64_128_hash_dword); + + // process finalize + tail_buffer = (uint8_t *) input_data + len - len % MUR_BLOCK_SIZE; + murmur3_x64_128_tail(tail_buffer, len, murmur3_x64_128_hash_dword); + + // output the digests + if (murmur3_x64_128_digest != NULL) { + murmur3_x64_128_digest[0] = murmur3_x64_128_hash_dword[0]; + murmur3_x64_128_digest[1] = murmur3_x64_128_hash_dword[1]; + murmur3_x64_128_digest[2] = murmur3_x64_128_hash_dword[2]; + murmur3_x64_128_digest[3] = murmur3_x64_128_hash_dword[3]; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c new file mode 100644 index 000000000..67eabd0c4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c @@ -0,0 +1,138 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
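For a quick standalone check of the single-call helper defined above, the following illustrative program (the main wrapper and message are made up; the seed value mirrors the tests' TEST_SEED) prints the 128-bit digest as four 32-bit words:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Prototype as defined above in murmur3_x64_128.c; the unit tests declare it extern the same way. */
void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
		     uint32_t *murmur3_x64_128_digest);

int main(void)
{
	const char msg[] = "hello, murmur3";
	uint32_t digest[4];	/* MURMUR3_x64_128_DIGEST_WORDS == 4 (128 bits) */

	murmur3_x64_128(msg, (uint32_t)strlen(msg), 0x1234, digest);
	printf("%08x%08x%08x%08x\n", digest[0], digest[1], digest[2], digest[3]);
	return 0;
}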
+**********************************************************************/ + +#include "mh_sha1_murmur3_x64_128_internal.h" +#include // for NULL + +/* murmur3_x64_128 constants */ +// Shift bits of circle rotate +#define MUR_SH1 31 +#define MUR_SH2 33 +#define MUR_SH3 27 +#define MUR_SH4 31 +#define MUR_SH5 33 + +#define MUR_MUL 5 +#define MUR_ADD1 0x52dce729 +#define MUR_ADD2 0x38495ab5 + +#define MUR_CON1 0x87c37b91114253d5LLU +#define MUR_CON2 0x4cf5ad432745937fLLU + +#define MUR_FMUL1 0xff51afd7ed558ccdLLU +#define MUR_FMUL2 0xc4ceb9fe1a85ec53LLU + +/* murmur3_x64_128 inline functions */ +static inline uint64_t blockmix64(uint64_t data, uint64_t conA, uint64_t conB, uint64_t shift) +{ + data *= conA; + data = (data << shift) | (data >> (64 - shift)); + data *= conB; + return data; +} + +static inline uint64_t hashmix64(uint64_t hashA, uint64_t hashB, uint64_t data, uint64_t add, + uint64_t shift) +{ + hashA ^= data; + hashA = (hashA << shift) | (hashA >> (64 - shift)); + hashA += hashB; + hashA = hashA * MUR_MUL + add; + return hashA; +} + +void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks, + uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]) +{ + uint64_t data1, data2; + uint64_t *input_qword = (uint64_t *) input_data; + uint64_t *hash = (uint64_t *) digests; + uint32_t i = 0; + + while (i < num_blocks) { + data1 = input_qword[i * 2]; + data2 = input_qword[i * 2 + 1]; + data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1); + data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2); + hash[0] = hashmix64(hash[0], hash[1], data1, MUR_ADD1, MUR_SH3); + hash[1] = hashmix64(hash[1], hash[0], data2, MUR_ADD2, MUR_SH4); + i++; + } + + return; +} + +void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len, + uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]) +{ + uint64_t data1, data2; + uint64_t *hash = (uint64_t *) digests; + uint64_t tail_len = total_len % 16; + uint8_t *tail = (uint8_t *) tail_buffer; + + union { + uint64_t hash[2]; + uint8_t hashB[16]; + } hashU; + + // tail + hashU.hash[0] = hashU.hash[1] = 0; + + while (tail_len-- > 0) + hashU.hashB[tail_len] = tail[tail_len]; + + data1 = hashU.hash[0]; + data2 = hashU.hash[1]; + + data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1); + data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2); + + hash[0] ^= total_len ^ data1; + hash[1] ^= total_len ^ data2; + + hash[0] += hash[1]; + hash[1] += hash[0]; + + hash[0] ^= hash[0] >> MUR_SH5; + hash[0] *= MUR_FMUL1; + hash[0] ^= hash[0] >> MUR_SH5; + hash[0] *= MUR_FMUL2; + hash[0] ^= hash[0] >> MUR_SH5; + + hash[1] ^= hash[1] >> MUR_SH5; + hash[1] *= MUR_FMUL1; + hash[1] ^= hash[1] >> MUR_SH5; + hash[1] *= MUR_FMUL2; + hash[1] ^= hash[1] >> MUR_SH5; + + hash[0] += hash[1]; + hash[1] += hash[0]; + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am new file mode 100644 index 000000000..d6e8b61ab --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am @@ -0,0 +1,88 @@ +######################################################################## +# Copyright(c) 2011-2017 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_sha256 = mh_sha256/sha256_for_mh_sha256.c + +lsrc_mh_sha256 = mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_block_sse.asm \ + mh_sha256/mh_sha256_block_avx.asm \ + mh_sha256/mh_sha256_block_avx2.asm \ + mh_sha256/mh_sha256_multibinary.asm \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +lsrc_mh_sha256 += mh_sha256/mh_sha256_block_avx512.asm \ + mh_sha256/mh_sha256_avx512.c + +lsrc_x86_64 += $(lsrc_sha256) \ + $(lsrc_mh_sha256) + +lsrc_x86_32 += $(lsrc_x86_64) + +other_src += mh_sha256/mh_sha256_ref.c \ + include/reg_sizes.asm \ + include/multibinary.asm \ + include/test.h \ + mh_sha256/mh_sha256_internal.h + +lsrc_aarch64 += $(lsrc_sha256) \ + mh_sha256/aarch64/mh_sha256_multibinary.S \ + mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c \ + mh_sha256/aarch64/mh_sha256_block_ce.S \ + mh_sha256/aarch64/mh_sha256_ce.c \ + mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +lsrc_base_aliases += $(lsrc_sha256) \ + mh_sha256/mh_sha256_base_aliases.c \ + mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +src_include += -I $(srcdir)/mh_sha256 + +extern_hdrs += include/mh_sha256.h + +check_tests += mh_sha256/mh_sha256_test +unit_tests += mh_sha256/mh_sha256_update_test + +perf_tests += mh_sha256/mh_sha256_perf + + +mh_sha256_test: mh_sha256_ref.o +mh_sha256_mh_sha256_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la + +mh_sha256_update_test: mh_sha256_ref.o +mh_sha256_mh_sha256_update_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la + +mh_sha256_mh_sha256_perf_LDADD = libisal_crypto.la diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c new file mode 100644 index 000000000..155790fc1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(mh_sha256_update) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(mh_sha256_update_ce); + + return PROVIDER_BASIC(mh_sha256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(mh_sha256_finalize) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(mh_sha256_finalize_ce); + + return PROVIDER_BASIC(mh_sha256_finalize); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S new file mode 100644 index 000000000..53a78ea7d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S @@ -0,0 +1,731 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
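The dispatcher above selects the SHA2 crypto-extension routines only when getauxval(AT_HWCAP) reports HWCAP_SHA2, falling back to the portable base code otherwise. The DEFINE_INTERFACE_DISPATCHER / PROVIDER_* macros come from aarch64_multibinary.h; the plain-C sketch below spells out the same selection without them, and resolve_mh_sha256_update() plus the *_impl typedef are illustrative names, not library symbols.

/* Plain-C sketch of the HWCAP-based selection done by the dispatcher above. */
#include <stdint.h>
#include <sys/auxv.h>           /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>          /* HWCAP_SHA2 on aarch64 Linux */

struct mh_sha256_ctx;

typedef int (*mh_sha256_update_impl)(struct mh_sha256_ctx *ctx,
                                     const void *buffer, uint32_t len);

int mh_sha256_update_ce(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len);
int mh_sha256_update_base(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len);

/* Pick the SHA2 crypto-extension path only if the kernel reports support. */
static mh_sha256_update_impl resolve_mh_sha256_update(void)
{
        if (getauxval(AT_HWCAP) & HWCAP_SHA2)
                return mh_sha256_update_ce;
        return mh_sha256_update_base;
}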
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 6 + + .global mh_sha256_block_ce + .type mh_sha256_block_ce, %function + +/* +Macros +*/ + +.macro declare_vector_reg name:req,reg:req,default:req + \name .req \default\reg + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + +declare_vector_reg lane0_msg0, 0,v +declare_vector_reg lane1_msg0, 1,v +declare_vector_reg lane2_msg0, 2,v +declare_vector_reg lane3_msg0, 3,v + +declare_vector_reg lane0_msg1, 4,v +declare_vector_reg lane1_msg1, 5,v +declare_vector_reg lane2_msg1, 6,v +declare_vector_reg lane3_msg1, 7,v + +declare_vector_reg lane0_msg2, 8,v +declare_vector_reg lane1_msg2, 9,v +declare_vector_reg lane2_msg2, 10,v +declare_vector_reg lane3_msg2, 11,v + +declare_vector_reg lane0_msg3, 12,v +declare_vector_reg lane1_msg3, 13,v +declare_vector_reg lane2_msg3, 14,v +declare_vector_reg lane3_msg3, 15,v + +declare_vector_reg lane0_state0, 16,v +declare_vector_reg lane1_state0, 17,v +declare_vector_reg lane2_state0, 18,v +declare_vector_reg lane3_state0, 19,v + +declare_vector_reg lane0_state1, 20,v +declare_vector_reg lane1_state1, 21,v +declare_vector_reg lane2_state1, 22,v +declare_vector_reg lane3_state1, 23,v + +declare_vector_reg lane0_tmp0, 24,v +declare_vector_reg lane1_tmp0, 25,v +declare_vector_reg lane2_tmp0, 26,v +declare_vector_reg lane3_tmp0, 27,v + +declare_vector_reg lane0_tmp2, 28,v +declare_vector_reg lane1_tmp2, 29,v +declare_vector_reg lane2_tmp2, 30,v +declare_vector_reg lane3_tmp2, 31,v + +declare_vector_reg key, 27,v +declare_vector_reg tmp, 29,v + +/* +void mh_sha256_block_ce(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], + uint32_t num_blocks); +*/ + x_input_data .req x0 + x_digests .req x1 + x_frame_buffer .req x2 + w_num_blocks .req w3 + + x_digest_addr .req x4 + x_key_addr .req x5 + x_msg_addr .req x6 + x_lane_offs .req x7 + x_offs .req x9 + w_input_data_end .req w10 + x_input_data_end .req x10 + x_tmp .req x11 +mh_sha256_block_ce: + cbz w_num_blocks, .exit + mov w_input_data_end, w_num_blocks + + ubfiz x_input_data_end, x_input_data_end, 10, 32 + add x_input_data_end, x_input_data, x_input_data_end + + adrp x_key_addr, .key_addr + add x_key_addr, x_key_addr, :lo12:.key_addr + + stp d8, d9, [sp, -192]! 
+ + stp d10, d11, [sp, 16] + stp d12, d13, [sp, 32] + stp d14, d15, [sp, 48] + + .p2align 3,,7 +.start_loop: + mov x_lane_offs, 0 + mov x_digest_addr, x_digests + +.lane_loop: + add x_msg_addr, x_input_data, x_lane_offs, lsl 2 + + .p2align 3,,7 + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs + + add x_tmp, x_digest_addr, 256 + ld4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs + + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[3], [x_msg_addr], x_offs + + // reverse for little endian + rev32 v_lane0_msg0.16b, v_lane0_msg0.16b + rev32 v_lane1_msg0.16b, v_lane1_msg0.16b + rev32 v_lane2_msg0.16b, v_lane2_msg0.16b + rev32 v_lane3_msg0.16b, v_lane3_msg0.16b + + rev32 v_lane0_msg1.16b, v_lane0_msg1.16b + rev32 v_lane1_msg1.16b, v_lane1_msg1.16b + rev32 v_lane2_msg1.16b, v_lane2_msg1.16b + rev32 v_lane3_msg1.16b, v_lane3_msg1.16b + + rev32 v_lane0_msg2.16b, v_lane0_msg2.16b + rev32 v_lane1_msg2.16b, v_lane1_msg2.16b + rev32 v_lane2_msg2.16b, v_lane2_msg2.16b + rev32 v_lane3_msg2.16b, v_lane3_msg2.16b + + rev32 v_lane0_msg3.16b, v_lane0_msg3.16b + rev32 v_lane1_msg3.16b, v_lane1_msg3.16b + rev32 v_lane2_msg3.16b, v_lane2_msg3.16b + rev32 v_lane3_msg3.16b, v_lane3_msg3.16b + + // rounds 0-3 + ldr q_key, [x_key_addr] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + str q_lane0_state1, [sp, 64] + str q_lane1_state1, [sp, 80] + str q_lane2_state1, [sp, 96] + str q_lane3_state1, [sp, 112] + + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[0], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[1], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[2], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[3], [x_tmp], x_offs + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, 
v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 4-7 + ldr q_key, [x_key_addr, 16] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 8-11 + ldr q_key, [x_key_addr, 32] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 12-15 + ldr q_key, [x_key_addr, 48] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, 
v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 16-19 + ldr q_key, [x_key_addr, 64] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 20-23 + ldr q_key, [x_key_addr, 80] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 
v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 24-27 + ldr q_key, [x_key_addr, 96] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 28-31 + ldr q_key, [x_key_addr, 112] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 32-35 + ldr q_key, [x_key_addr, 128] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, 
v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 36-39 + ldr q_key, [x_key_addr, 144] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 40-43 + ldr q_key, [x_key_addr, 160] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 44-47 + ldr q_key, [x_key_addr, 176] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, 
v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 48-51 + ldr q_key, [x_key_addr, 192] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 52-55 + ldr q_key, [x_key_addr, 208] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 56-59 + ldr q_key, [x_key_addr, 224] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 60-63 + ldr q_key, [x_key_addr, 
240] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[0], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[1], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[2], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[3], [x_tmp], x_offs + + add v_lane0_state0.4s, v_lane0_tmp0.4s, v_lane0_state0.4s + add v_lane1_state0.4s, v_lane1_tmp0.4s, v_lane1_state0.4s + add v_lane2_state0.4s, v_lane2_tmp0.4s, v_lane2_state0.4s + add v_lane3_state0.4s, v_lane3_tmp0.4s, v_lane3_state0.4s + + mov x_offs, 64 + mov x_tmp, x_digest_addr + st4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs + + ldp q_lane0_tmp2, q_lane1_tmp2, [sp, 64] + ldp q_lane2_tmp2, q_lane3_tmp2, [sp, 96] + + add v_lane0_state1.4s, v_lane0_tmp2.4s, v_lane0_state1.4s + add v_lane1_state1.4s, v_lane1_tmp2.4s, v_lane1_state1.4s + add v_lane2_state1.4s, v_lane2_tmp2.4s, v_lane2_state1.4s + add v_lane3_state1.4s, v_lane3_tmp2.4s, v_lane3_state1.4s + + mov x_offs, 64 + add x_tmp, x_digest_addr, 256 + st4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs + + add x_digest_addr, x_digest_addr, 16 + add x_lane_offs, x_lane_offs, 4 + cmp x_lane_offs, 16 + bne .lane_loop + + add x_input_data, x_input_data, 1024 + cmp x_input_data, x_input_data_end + bne .start_loop + + ldp d10, d11, [sp, 16] + ldp d12, d13, [sp, 32] + ldp d14, d15, [sp, 48] + ldp d8, d9, [sp], 192 +.exit: + ret + .size mh_sha256_block_ce, .-mh_sha256_block_ce + + .section .rodata + .align 4 + .set .key_addr,. 
+ 0 + .type K, %object + .size K, 256 +K: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c new file mode 100644 index 000000000..c42333ed5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c @@ -0,0 +1,53 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
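mh_sha256_block_ce above walks each 1 KB block as 16 interleaved SHA-256 segments, four lanes per pass, using ld4 loads with a 64-byte stride; the interim digests are laid out as digests[word][segment], with words A-D at offset 0 and words E-H 256 bytes later. The scalar sketch below shows that layout as the loads imply it; sha256_single_block() is a stand-in for a plain one-block SHA-256 compression and is an assumption, not an isa-l symbol.

/* Scalar sketch of the data layout the CE kernel above walks with ld4/st4:
 * word j of segment s sits at byte offset j*64 + s*4 within each 1 KB block,
 * and column s of the digest matrix holds segment s's A..H state. */
#include <stdint.h>
#include <string.h>

#define HASH_SEGS               16
#define SHA256_DIGEST_WORDS     8
#define MH_SHA256_BLOCK_SIZE    (HASH_SEGS * 64)        /* 1024 bytes */

void sha256_single_block(const uint8_t block[64],
                         uint32_t state[SHA256_DIGEST_WORDS]);

void mh_sha256_block_scalar(const uint8_t *input_data,
                            uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
                            uint32_t num_blocks)
{
        uint8_t seg_block[64];
        uint32_t state[SHA256_DIGEST_WORDS];

        for (uint32_t b = 0; b < num_blocks; b++) {
                const uint8_t *blk = input_data + (size_t)b * MH_SHA256_BLOCK_SIZE;

                for (uint32_t s = 0; s < HASH_SEGS; s++) {
                        /* Gather segment s: one 4-byte word from each 64-byte row. */
                        for (uint32_t j = 0; j < 16; j++)
                                memcpy(seg_block + j * 4, blk + j * 64 + s * 4, 4);

                        /* Load column s of the digest matrix as this segment's state. */
                        for (uint32_t w = 0; w < SHA256_DIGEST_WORDS; w++)
                                state[w] = digests[w][s];

                        sha256_single_block(seg_block, state);

                        for (uint32_t w = 0; w < SHA256_DIGEST_WORDS; w++)
                                digests[w][s] = state[w];
                }
        }
}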
+**********************************************************************/ +#include +#include "mh_sha256_internal.h" + +void mh_sha256_block_ce(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +/***************mh_sha256_update***********/ +// mh_sha256_update_ce.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_ce +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail +// mh_sha256_finalize_ce.c and mh_sha256_tail_ce.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_ce +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_ce +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S new file mode 100644 index 000000000..54eece175 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
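mh_sha256_ce.c above builds mh_sha256_update_ce and mh_sha256_finalize_ce by redefining the MH_SHA256_*_FUNCTION macros and re-including the generic mh_sha256_update_base.c / mh_sha256_finalize_base.c bodies. The self-contained toy below illustrates the same one-body/many-bindings idea with a macro instead of an #include; every name in it is made up for the illustration, only the pattern mirrors the library.

/* Toy illustration of the "one generic body, several bindings" pattern. */
#include <stdint.h>
#include <stdio.h>

static void block_generic(const uint8_t *data, uint32_t n) { (void)data; printf("generic x%u\n", n); }
static void block_ce(const uint8_t *data, uint32_t n)      { (void)data; printf("ce x%u\n", n); }

/* One "base" update body, instantiated once per block routine. */
#define DEFINE_UPDATE(update_name, block_name)                          \
        static void update_name(const uint8_t *data, uint32_t len)     \
        {                                                               \
                /* a real update also buffers the tail bytes */         \
                block_name(data, len / 1024);                           \
        }

DEFINE_UPDATE(update_generic, block_generic)
DEFINE_UPDATE(update_ce, block_ce)

int main(void)
{
        uint8_t buf[2048] = { 0 };

        update_generic(buf, sizeof(buf));
        update_ce(buf, sizeof(buf));
        return 0;
}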
+**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface mh_sha256_update +mbin_interface mh_sha256_finalize diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c new file mode 100644 index 000000000..242c3e218 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c @@ -0,0 +1,143 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
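The mbin_interface lines above export mh_sha256_update and mh_sha256_finalize as multibinary entry points bound to whatever implementation the dispatcher picks. The C analogue below sketches the general resolve-once-then-call-through-a-cached-pointer idea; mh_sha256_update_dispatcher() is an illustrative name and this is not the assembly's actual mechanism.

/* Illustrative C analogue of a multibinary entry point. */
#include <stdint.h>

struct mh_sha256_ctx;

typedef int (*update_fn)(struct mh_sha256_ctx *, const void *, uint32_t);

update_fn mh_sha256_update_dispatcher(void);    /* picks ce vs base at runtime */

int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
{
        static update_fn resolved;

        if (!resolved)
                resolved = mh_sha256_update_dispatcher();
        return resolved(ctx, buffer, len);
}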
+**********************************************************************/ + +#include +#include "mh_sha256_internal.h" + +int mh_sha256_init(struct mh_sha256_ctx *ctx) +{ + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS]; + uint32_t i; + + if (ctx == NULL) + return MH_SHA256_CTX_ERROR_NULL; + + memset(ctx, 0, sizeof(*ctx)); + + mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests; + for (i = 0; i < HASH_SEGS; i++) { + mh_sha256_segs_digests[0][i] = MH_SHA256_H0; + mh_sha256_segs_digests[1][i] = MH_SHA256_H1; + mh_sha256_segs_digests[2][i] = MH_SHA256_H2; + mh_sha256_segs_digests[3][i] = MH_SHA256_H3; + mh_sha256_segs_digests[4][i] = MH_SHA256_H4; + mh_sha256_segs_digests[5][i] = MH_SHA256_H5; + mh_sha256_segs_digests[6][i] = MH_SHA256_H6; + mh_sha256_segs_digests[7][i] = MH_SHA256_H7; + } + + return MH_SHA256_CTX_ERROR_NONE; +} + +#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \ + || defined( _M_X64) || defined(_M_IX86)) +/***************mh_sha256_update***********/ +// mh_sha256_update_sse.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_sse +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_update_avx.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_update_avx2.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx2 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2 +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail + +// mh_sha256_finalize_sse.c and mh_sha256_tail_sse.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_sse +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_sse +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_finalize_avx.c and mh_sha256_tail_avx.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_finalize_avx2.c and mh_sha256_tail_avx2.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx2 +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx2 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2 +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************version info***********/ + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +// Version info +struct slver mh_sha256_init_slver_000002b1; +struct slver mh_sha256_init_slver = { 0x02b1, 0x00, 0x00 }; + +// mh_sha256_update version info +struct slver mh_sha256_update_sse_slver_000002b4; +struct slver mh_sha256_update_sse_slver = { 0x02b4, 0x00, 0x00 }; + +struct slver mh_sha256_update_avx_slver_020002b6; +struct slver mh_sha256_update_avx_slver = { 0x02b6, 
0x00, 0x02 }; + +struct slver mh_sha256_update_avx2_slver_040002b8; +struct slver mh_sha256_update_avx2_slver = { 0x02b8, 0x00, 0x04 }; + +// mh_sha256_finalize version info +struct slver mh_sha256_finalize_sse_slver_000002b5; +struct slver mh_sha256_finalize_sse_slver = { 0x02b5, 0x00, 0x00 }; + +struct slver mh_sha256_finalize_avx_slver_020002b7; +struct slver mh_sha256_finalize_avx_slver = { 0x02b7, 0x00, 0x02 }; + +struct slver mh_sha256_finalize_avx2_slver_040002b9; +struct slver mh_sha256_finalize_avx2_slver = { 0x02b9, 0x00, 0x04 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c new file mode 100644 index 000000000..35fb0fbad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c @@ -0,0 +1,70 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include "mh_sha256_internal.h" + +#ifdef HAVE_AS_KNOWS_AVX512 + +/***************mh_sha256_update***********/ +// mh_sha256_update_avx512.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx512 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512 +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail +// mh_sha256_finalize_avx512.c and mh_sha256_tail_avx512.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx512 +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx512 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512 +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************version info***********/ +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// mh_sha256_update version info +struct slver mh_sha256_update_avx512_slver_060002bc; +struct slver mh_sha256_update_avx512_slver = { 0x02bc, 0x00, 0x06 }; + +// mh_sha256_finalize version info +struct slver mh_sha256_finalize_avx512_slver_060002bd; +struct slver mh_sha256_finalize_avx512_slver = { 0x02bd, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c new file mode 100644 index 000000000..343ffb024 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c @@ -0,0 +1,40 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "mh_sha256_internal.h" +#include +int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len) +{ + return mh_sha256_update_base(ctx, buffer, len); + +} + +int mh_sha256_finalize(struct mh_sha256_ctx *ctx, void *mh_sha256_digest) +{ + return mh_sha256_finalize_base(ctx, mh_sha256_digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm new file mode 100644 index 000000000..c2eff350d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm @@ -0,0 +1,557 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
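mh_sha256_base_aliases.c above simply forwards the public entry points to the base implementations on targets without a multibinary dispatcher. A minimal caller of that public API, the same calls the tests in the Makefile link against libisal_crypto.la for, could look like the sketch below; the mh_sha256.h header name comes from extern_hdrs above, the 8-word digest size and the assumption that update/finalize return the same MH_SHA256_CTX_ERROR_NONE code as init follow from this series, and the buffer contents are illustrative.

/* Minimal sketch of driving the public mh_sha256 API. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "mh_sha256.h"

int main(void)
{
        struct mh_sha256_ctx ctx;
        uint32_t digest[8];     /* SHA256_DIGEST_WORDS */
        uint8_t data[4096];

        memset(data, 0xa5, sizeof(data));

        if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
                return 1;
        /* update() may be called repeatedly as data streams in */
        if (mh_sha256_update(&ctx, data, sizeof(data)) != MH_SHA256_CTX_ERROR_NONE)
                return 1;
        if (mh_sha256_finalize(&ctx, digest) != MH_SHA256_CTX_ERROR_NONE)
                return 1;

        for (int i = 0; i < 8; i++)
                printf("%08x", digest[i]);
        printf("\n");
        return 0;
}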
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + 
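The PRORD/PRORD_nd macros defined next implement a 32-bit rotate-right as (x << (32-n)) | (x >> n) on four segment lanes at once, and the round macros combine those rotates into the usual SHA-256 sigma, ch and maj terms (the shift amounts appear in the macro comments). A scalar C reference for the same primitives:

/* Scalar reference for the primitives the vector macros below build from. */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
        return (x << (32 - n)) | (x >> n);      /* what PRORD computes per lane */
}

/* Big sigma, used in ROUND_00_15_*: */
static inline uint32_t Sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
static inline uint32_t Sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }

/* Small sigma, used in the ROUND_16_XX message schedule: */
static inline uint32_t sigma0(uint32_t w) { return rotr32(w, 7) ^ rotr32(w, 18) ^ (w >> 3); }
static inline uint32_t sigma1(uint32_t w) { return rotr32(w, 17) ^ rotr32(w, 19) ^ (w >> 10); }

/* ch/maj exactly as computed with vpxor/vpand/vpor in the round macros: */
static inline uint32_t ch(uint32_t e, uint32_t f, uint32_t g)  { return ((f ^ g) & e) ^ g; }
static inline uint32_t maj(uint32_t a, uint32_t b, uint32_t c) { return ((a ^ c) & b) | (a & c); }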
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa %%T1, [SZ4*(%%i&0xf) + %%data] + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + %%data], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + %%data] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + 
vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + %%data] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + %%data] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_avx, function, internal +func(mh_sha256_block_avx) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by avx + and rsp, ~0x0F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 8 + VMOVPS a, [mh_digests_p + I*64 + 16*0] + VMOVPS b, [mh_digests_p + I*64 + 16*1] + VMOVPS c, [mh_digests_p + I*64 + 16*2] + VMOVPS d, [mh_digests_p + I*64 + 16*3] + + vmovdqa [rsp + I*64 + 16*0], a + vmovdqa [rsp + I*64 + 16*1], b + vmovdqa [rsp + I*64 + 16*2], c + vmovdqa [rsp + I*64 + 16*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4 + %assign I 0 + %rep 16 + VMOVPS TT0,[mh_in_p + I*64+0*16] + VMOVPS TT1,[mh_in_p + I*64+1*16] + VMOVPS TT2,[mh_in_p + I*64+2*16] + VMOVPS TT3,[mh_in_p + I*64+3*16] + + vpshufb TT0, TMP + vmovdqa [mh_data_p +(I)*16 +0*256],TT0 + vpshufb TT1, TMP + vmovdqa [mh_data_p +(I)*16 +1*256],TT1 + vpshufb TT2, TMP + vmovdqa [mh_data_p +(I)*16 +2*256],TT2 + vpshufb TT3, TMP + vmovdqa [mh_data_p +(I)*16 +3*256],TT3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + vmovdqa a, [rsp + 0*64 + mh_segs] + vmovdqa b, [rsp + 1*64 + mh_segs] + vmovdqa c, [rsp + 2*64 + mh_segs] + vmovdqa d, [rsp + 3*64 + mh_segs] + vmovdqa e, [rsp + 4*64 + mh_segs] + vmovdqa f, [rsp + 5*64 + mh_segs] + vmovdqa g, [rsp + 6*64 + mh_segs] + vmovdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + %if i = 48 + PREFETCH_X [mh_in_p + pref+128*1] + %endif + ROUND_16_XX T1, i, mh_data_p + %assign i (i+1) + %endrep + + ;; add old digest + vpaddd a, a, [rsp + 0*64 + mh_segs] + vpaddd b, b, [rsp + 1*64 + mh_segs] + vpaddd c, c, [rsp + 2*64 + mh_segs] + vpaddd d, d, [rsp + 3*64 + mh_segs] + vpaddd e, e, [rsp + 4*64 + mh_segs] + vpaddd f, f, [rsp + 5*64 + mh_segs] + vpaddd g, g, [rsp + 6*64 + mh_segs] + vpaddd h, h, [rsp + 7*64 + mh_segs] + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], a + vmovdqa [rsp + 1*64 + mh_segs], b + vmovdqa [rsp + 2*64 + mh_segs], c + vmovdqa [rsp + 3*64 + mh_segs], d + vmovdqa [rsp + 4*64 + 
mh_segs], e + vmovdqa [rsp + 5*64 + mh_segs], f + vmovdqa [rsp + 6*64 + mh_segs], g + vmovdqa [rsp + 7*64 + mh_segs], h + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 8 + vmovdqa a, [rsp + I*64 + 16*0] + vmovdqa b, [rsp + I*64 + 16*1] + vmovdqa c, [rsp + I*64 + 16*2] + vmovdqa d, [rsp + I*64 + 16*3] + + VMOVPS [mh_digests_p + I*64 + 16*0], a + VMOVPS [mh_digests_p + I*64 + 16*1], b + VMOVPS [mh_digests_p + I*64 + 16*2], c + VMOVPS [mh_digests_p + I*64 + 16*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 
0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm new file mode 100644 index 000000000..c2b3f2c59 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm @@ -0,0 +1,616 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX-2 +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define SZ 4 +%define SZ8 8*SZ +%define ROUNDS 64*SZ8 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define a0 ymm8 +%define a1 ymm9 +%define a2 ymm10 + +%define TT0 ymm14 +%define TT1 ymm13 +%define TT2 ymm12 +%define TT3 ymm11 +%define TT4 ymm10 +%define TT5 ymm9 + +%define T1 ymm14 +%define TMP ymm15 + 
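[Editor's note, illustrative only] Each ROUND_00_15_R/ROUND_00_15_W macro below carries out one SHA-256 round across eight interleaved segments, one segment per 32-bit ymm lane. For reference, a scalar C sketch of the single-lane round being vectorized (the standard FIPS 180-4 round function; names such as sha256_round are illustrative):

#include <stdint.h>

/* 32-bit rotate right, valid for 1 <= n <= 31. */
static inline uint32_t rotr(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round: st[0..7] holds the working variables a..h,
 * kt is the round constant K[t], wt is the message schedule word W[t]. */
static void sha256_round(uint32_t st[8], uint32_t kt, uint32_t wt)
{
	uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
	uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

	uint32_t sigma1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);
	uint32_t ch     = (e & f) ^ (~e & g);
	uint32_t t1     = h + sigma1 + ch + kt + wt;

	uint32_t sigma0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);
	uint32_t maj    = (a & b) ^ (a & c) ^ (b & c);
	uint32_t t2     = sigma0 + maj;

	st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
	st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
}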
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data] + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data] + vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + 
vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data] + vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + %%data] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_avx2, function, internal +func(mh_sha256_block_avx2) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 32 Bytes needed by avx2 + and rsp, ~0x1F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 4 + VMOVPS a, [mh_digests_p + I*64*2 + 32*0] + VMOVPS b, [mh_digests_p + I*64*2 + 32*1] + VMOVPS c, [mh_digests_p + I*64*2 + 32*2] + VMOVPS d, [mh_digests_p + I*64*2 + 32*3] + + vmovdqa [rsp + I*64*2 + 32*0], a + vmovdqa [rsp + I*64*2 + 32*1], b + vmovdqa [rsp + I*64*2 + 32*2], c + vmovdqa [rsp + I*64*2 + 32*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2 + %assign I 0 + %rep 16 + VMOVPS TT0,[mh_in_p + I*64+0*32] + VMOVPS TT1,[mh_in_p + I*64+1*32] + + vpshufb TT0, TT0, TMP + vmovdqa [mh_data_p +I*32 +0*512],TT0 + vpshufb TT1, TT1, TMP + vmovdqa [mh_data_p +I*32 +1*512],TT1 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 8 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + vmovdqa a, [rsp + 0*64 + mh_segs] + vmovdqa b, [rsp + 1*64 + mh_segs] + vmovdqa c, [rsp + 2*64 + mh_segs] + vmovdqa d, [rsp + 3*64 + mh_segs] + vmovdqa e, [rsp + 4*64 + mh_segs] + vmovdqa f, [rsp + 5*64 + mh_segs] + vmovdqa g, [rsp + 6*64 + mh_segs] + vmovdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + ROUND_16_XX T1, i, mh_data_p + %if i % 16 = 8 + PREFETCH_X [mh_in_p + pref+128*(i/16)] + %endif + %assign i (i+1) + %endrep + + ;; add old digest + vpaddd a, a, [rsp + 0*64 + mh_segs] + vpaddd b, b, [rsp + 1*64 + mh_segs] + vpaddd c, c, [rsp + 2*64 + mh_segs] + vpaddd d, d, [rsp + 3*64 + mh_segs] + vpaddd e, e, [rsp + 4*64 + mh_segs] + vpaddd f, f, [rsp + 5*64 + mh_segs] + vpaddd g, g, [rsp + 6*64 + mh_segs] + vpaddd h, h, [rsp + 7*64 + mh_segs] + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], a + vmovdqa [rsp + 1*64 + mh_segs], b + vmovdqa [rsp + 2*64 + mh_segs], c + vmovdqa [rsp + 3*64 + mh_segs], d + vmovdqa [rsp + 4*64 + mh_segs], e + vmovdqa [rsp + 5*64 + mh_segs], f + vmovdqa [rsp + 6*64 + mh_segs], g + vmovdqa [rsp + 7*64 + mh_segs], h + + add pref, 512 + 
add mh_data_p, 512 + add mh_segs, 32 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 4 + vmovdqa a, [rsp + I*64*2 + 32*0] + vmovdqa b, [rsp + I*64*2 + 32*1] + vmovdqa c, [rsp + I*64*2 + 32*2] + vmovdqa d, [rsp + I*64*2 + 32*3] + + VMOVPS [mh_digests_p + I*64*2 + 32*0], a + VMOVPS [mh_digests_p + I*64*2 + 32*1], b + VMOVPS [mh_digests_p + I*64*2 + 32*2], c + VMOVPS [mh_digests_p + I*64*2 + 32*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 
0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm new file mode 100644 index 000000000..1ee76ddfc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm @@ -0,0 +1,682 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX-512 +;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + 
%endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define APPEND(a,b) a %+ b +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 +;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 +;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 +;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 + +; Main processing loop per round +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddd T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, T1, %%WT ; T1 = T1 + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpaddd D, D, T1 ; D = D + T1 + + vprord H, A, 2 ; ROR_2(A) + vprord TMP2, A, 13 ; ROR_13(A) + vprord TMP3, A, 22 ; ROR_22(A) + vmovdqa32 TMP0, A + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_63 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, 
%%WTp14, 10 ; SHR_10(Wt-2) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + +%endmacro + +; Note this is reading in a block of data for one lane +; When all 16 are read, the data must be transposed to build msg schedule +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +[bits 64] +section .text +align 32 + +;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. +; arg 3 number of 1KB blocks +; +global mh_sha256_block_avx512 +func(mh_sha256_block_avx512) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 64 Bytes needed by avx512 + and rsp, ~0x3F + lea TBL,[TABLE] + + ; copy segs_digests into stack and ZMM + VMOVPS A, [mh_digests_p + 64*0] + VMOVPS B, [mh_digests_p + 64*1] + VMOVPS C, [mh_digests_p + 64*2] + VMOVPS D, [mh_digests_p + 64*3] + VMOVPS E, [mh_digests_p + 64*4] + VMOVPS F, [mh_digests_p + 64*5] + VMOVPS G, [mh_digests_p + 64*6] + VMOVPS H, [mh_digests_p + 64*7] + +.block_loop: + ; Save digests for later addition + vmovdqa32 [rsp + 64*0], A + vmovdqa32 [rsp + 64*1], B + vmovdqa32 [rsp + 64*2], C + vmovdqa32 [rsp + 64*3], D + vmovdqa32 [rsp + 64*4], E + vmovdqa32 [rsp + 64*5], F + vmovdqa32 [rsp + 64*6], G + vmovdqa32 [rsp + 64*7], H + + vmovdqa32 TMP3, [TBL] ; First K + ;transform to big-endian data and store on aligned_frame + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + ;using extra 16 ZMM registers instead of heap +%assign I 0 +%rep 8 +%assign J (I+1) + VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64] + VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64] + + vpshufb APPEND(W,I), APPEND(W,I), TMP2 + vpshufb APPEND(W,J), APPEND(W,J), TMP2 +%assign I (I+2) +%endrep + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process first 48 rounds + ; Calculate next Wt+16 after processing is complete and Wt is unneeded + + ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep 64 + PROCESS_LOOP APPEND(W,J), I + %if I < 48 + MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %endif + %if I % 8 = 4 + PREFETCH_X [mh_in_p + 1024+128*(I / 8)] + %endif +%assign I (I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + + ;; add old digest + vpaddd A, A, [rsp + 0*64] + vpaddd B, B, [rsp + 1*64] + vpaddd C, C, [rsp + 2*64] + vpaddd D, D, [rsp + 3*64] + vpaddd E, E, [rsp + 4*64] + 
vpaddd F, F, [rsp + 5*64] + vpaddd G, G, [rsp + 6*64] + vpaddd H, H, [rsp + 7*64] + + add mh_in_p, 1024 + sub loops, 1 + jne .block_loop + + ; copy segs_digests back to mh_digests_p + + VMOVPS [mh_digests_p + 64*0], A + VMOVPS [mh_digests_p + 64*1], B + VMOVPS [mh_digests_p + 64*2], C + VMOVPS [mh_digests_p + 64*3], D + VMOVPS [mh_digests_p + 64*4], E + VMOVPS [mh_digests_p + 64*5], F + VMOVPS [mh_digests_p + 64*6], G + VMOVPS [mh_digests_p + 64*7], H + + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 
0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 
0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 
0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_mh_sha256_block_avx512 +no_mh_sha256_block_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c new file mode 100644 index 000000000..8d9a828c6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c @@ -0,0 +1,188 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// store_w is only used for step 0 ~ 15
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s]))
+#define Ws(x, s) w[(x) & 15][s]
+// update_w is used for step > 15
+#define update_w(s, i, w) \
+	Ws(i, s) = Ws(i-16, s) + S0(Ws(i-15, s)) + Ws(i-7, s) + S1(Ws(i-2, s))
+#define update_t2(s, a, b, c) t2[s] = s0(a[s]) + maj(a[s],b[s],c[s])
+#define update_t1(s, h, e, f, g, i, k) \
+	t1[s] = h[s] + s1(e[s]) + ch(e[s],f[s],g[s]) + k + Ws(i, s);
+#define update_d(s) d[s] += t1[s]
+#define update_h(s) h[s] = t1[s] + t2[s]
+
+// s is an iterator
+#define STORE_W(s, i, w, ww) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		store_w(s, i, w, ww);
+#define UPDATE_W(s, i, w) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		update_w(s, i, w);
+#define UPDATE_T2(s, a, b, c) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		update_t2(s, a, b, c);
+#define UPDATE_T1(s, h, e, f, g, i, k) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		update_t1(s, h, e, f, g, i, k);
+#define UPDATE_D(s) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		update_d(s);
+#define UPDATE_H(s) \
+	for(s = 0; s < HASH_SEGS; s++) \
+		update_h(s);
+
+static inline void step(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+			uint32_t * d, uint32_t * e, uint32_t * f,
+			uint32_t * g, uint32_t * h, uint32_t k,
+			uint32_t * t1, uint32_t * t2, uint32_t(*w)[HASH_SEGS], uint32_t * ww)
+{
+	uint8_t s;
+	if (i < 16) {
+		STORE_W(s, i, w, ww);
+	} else {
+		UPDATE_W(s, i, w);
+	}
+	UPDATE_T2(s, a, b, c);
+	UPDATE_T1(s, h, e, f, g, i, k);
+	UPDATE_D(s);
+	UPDATE_H(s);
+}
+
+static inline void init_abcdefgh(uint32_t * xx, uint32_t n,
+				 uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+	uint8_t s;
+	for (s = 0; s < HASH_SEGS; s++)
+		xx[s] = digests[n][s];
+}
+
+static inline void add_abcdefgh(uint32_t * xx, uint32_t n,
+				uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+	uint8_t s;
+	for (s = 0; s < HASH_SEGS; s++)
+		digests[n][s] += xx[s];
+}
+
+/*
+ * API to perform 0-64 steps of the multi-hash algorithm for
+ * a single block of data. The caller is responsible for ensuring
+ * a full block of data input.
+ *
+ * Argument:
+ *   input  - the pointer to the data
+ *   digest - the space to hold the digests for all segments.
+ * + * Return: + * N/A + */ +void mh_sha256_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS], + uint8_t * frame_buffer) +{ + uint8_t i; + uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS]; + uint32_t ee[HASH_SEGS], ff[HASH_SEGS], gg[HASH_SEGS], hh[HASH_SEGS]; + uint32_t t1[HASH_SEGS], t2[HASH_SEGS]; + uint32_t *ww = (uint32_t *) input; + uint32_t(*w)[HASH_SEGS]; + + const static uint32_t k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + + w = (uint32_t(*)[HASH_SEGS]) frame_buffer; + + init_abcdefgh(aa, 0, digests); + init_abcdefgh(bb, 1, digests); + init_abcdefgh(cc, 2, digests); + init_abcdefgh(dd, 3, digests); + init_abcdefgh(ee, 4, digests); + init_abcdefgh(ff, 5, digests); + init_abcdefgh(gg, 6, digests); + init_abcdefgh(hh, 7, digests); + + for (i = 0; i < 64; i += 8) { + step(i, aa, bb, cc, dd, ee, ff, gg, hh, k[i], t1, t2, w, ww); + step(i + 1, hh, aa, bb, cc, dd, ee, ff, gg, k[i + 1], t1, t2, w, ww); + step(i + 2, gg, hh, aa, bb, cc, dd, ee, ff, k[i + 2], t1, t2, w, ww); + step(i + 3, ff, gg, hh, aa, bb, cc, dd, ee, k[i + 3], t1, t2, w, ww); + step(i + 4, ee, ff, gg, hh, aa, bb, cc, dd, k[i + 4], t1, t2, w, ww); + step(i + 5, dd, ee, ff, gg, hh, aa, bb, cc, k[i + 5], t1, t2, w, ww); + step(i + 6, cc, dd, ee, ff, gg, hh, aa, bb, k[i + 6], t1, t2, w, ww); + step(i + 7, bb, cc, dd, ee, ff, gg, hh, aa, k[i + 7], t1, t2, w, ww); + } + + add_abcdefgh(aa, 0, digests); + add_abcdefgh(bb, 1, digests); + add_abcdefgh(cc, 2, digests); + add_abcdefgh(dd, 3, digests); + add_abcdefgh(ee, 4, digests); + add_abcdefgh(ff, 5, digests); + add_abcdefgh(gg, 6, digests); + add_abcdefgh(hh, 7, digests); +} + +void mh_sha256_block_base(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks) +{ + uint32_t i; + + for (i = 0; i < num_blocks; i++) { + mh_sha256_single(input_data, digests, frame_buffer); + input_data += MH_SHA256_BLOCK_SIZE; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm new file mode 100644 index 000000000..b1d6fd9ea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm @@ -0,0 +1,557 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using SSE +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov 
r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define MOVPS movups + +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa %%T1,[SZ4*(%%i&0xf) + %%data] + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + %%data], %%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h 
+ ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data] + movdqa a1, [SZ4*((%%i-2)&0xf) + %%data] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + %%data] + paddd a1, [SZ4*((%%i-7)&0xf) + %%data] + paddd %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data + +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. 
+; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_sse, function, internal +func(mh_sha256_block_sse) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by sse + and rsp, ~0x0F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 8 + MOVPS a, [mh_digests_p + I*64 + 16*0] + MOVPS b, [mh_digests_p + I*64 + 16*1] + MOVPS c, [mh_digests_p + I*64 + 16*2] + MOVPS d, [mh_digests_p + I*64 + 16*3] + + movdqa [rsp + I*64 + 16*0], a + movdqa [rsp + I*64 + 16*1], b + movdqa [rsp + I*64 + 16*2], c + movdqa [rsp + I*64 + 16*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4 + %assign I 0 + %rep 16 + MOVPS TT0,[mh_in_p + I*64+0*16] + MOVPS TT1,[mh_in_p + I*64+1*16] + MOVPS TT2,[mh_in_p + I*64+2*16] + MOVPS TT3,[mh_in_p + I*64+3*16] + + pshufb TT0, TMP + movdqa [mh_data_p +(I)*16 +0*256],TT0 + pshufb TT1, TMP + movdqa [mh_data_p +(I)*16 +1*256],TT1 + pshufb TT2, TMP + movdqa [mh_data_p +(I)*16 +2*256],TT2 + pshufb TT3, TMP + movdqa [mh_data_p +(I)*16 +3*256],TT3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + movdqa a, [rsp + 0*64 + mh_segs] + movdqa b, [rsp + 1*64 + mh_segs] + movdqa c, [rsp + 2*64 + mh_segs] + movdqa d, [rsp + 3*64 + mh_segs] + movdqa e, [rsp + 4*64 + mh_segs] + movdqa f, [rsp + 5*64 + mh_segs] + movdqa g, [rsp + 6*64 + mh_segs] + movdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + %if i = 48 + PREFETCH_X [mh_in_p + pref+128*1] + %endif + ROUND_16_XX T1, i, mh_data_p + %assign i (i+1) + %endrep + + ;; add old digest + paddd a, [rsp + 0*64 + mh_segs] + paddd b, [rsp + 1*64 + mh_segs] + paddd c, [rsp + 2*64 + mh_segs] + paddd d, [rsp + 3*64 + mh_segs] + paddd e, [rsp + 4*64 + mh_segs] + paddd f, [rsp + 5*64 + mh_segs] + paddd g, [rsp + 6*64 + mh_segs] + paddd h, [rsp + 7*64 + mh_segs] + + ; write out digests + movdqa [rsp + 0*64 + mh_segs], a + movdqa [rsp + 1*64 + mh_segs], b + movdqa [rsp + 2*64 + mh_segs], c + movdqa [rsp + 3*64 + mh_segs], d + movdqa [rsp + 4*64 + mh_segs], e + movdqa [rsp + 5*64 + mh_segs], f + movdqa [rsp + 6*64 + mh_segs], g + movdqa [rsp + 7*64 + mh_segs], h + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 8 + movdqa a, [rsp + I*64 + 16*0] + movdqa b, [rsp + I*64 + 16*1] + movdqa c, [rsp + I*64 + 16*2] + movdqa d, [rsp + I*64 + 16*3] + + MOVPS [mh_digests_p + I*64 + 16*0], a + MOVPS [mh_digests_p + I*64 + 16*1], b + MOVPS [mh_digests_p + I*64 + 16*2], c + MOVPS [mh_digests_p + I*64 + 16*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=16 + +align 16 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 
0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c new file mode 100644 index 000000000..6abb20688 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c @@ -0,0 +1,121 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * mh_sha256_finalize_base.c contains the prototypes of mh_sha256_finalize_XXX + * and mh_sha256_tail_XXX. Default definitions are base type which generates + * mh_sha256_finalize_base and mh_sha256_tail_base. Other types are generated + * through different predefined macros by mh_sha256.c. + * mh_sha256_tail is used to calculate the last incomplete block of input + * data. mh_sha256_finalize is the mh_sha256_ctx wrapper of mh_sha256_tail. + */ +#ifndef MH_SHA256_FINALIZE_FUNCTION +#include +#include "mh_sha256_internal.h" + +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_base +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_base +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base +#define MH_SHA256_FINALIZE_SLVER +#endif + +void MH_SHA256_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t * frame_buffer, uint32_t digests[SHA256_DIGEST_WORDS]) +{ + uint64_t partial_buffer_len, len_in_bit; + + partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE; + + // Padding the first block + partial_buffer[partial_buffer_len] = 0x80; + partial_buffer_len++; + memset(partial_buffer + partial_buffer_len, 0, + MH_SHA256_BLOCK_SIZE - partial_buffer_len); + + // Calculate the first block without total_length if padding needs 2 block + if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) { + MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, + 1); + //Padding the second block + memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE); + } + //Padding the block + len_in_bit = to_be64((uint64_t) total_len * 8); + *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit; + MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1); + + //Calculate multi-hash SHA256 digests (segment digests as input message) + sha256_for_mh_sha256((uint8_t *) mh_sha256_segs_digests, digests, + 4 * SHA256_DIGEST_WORDS * HASH_SEGS); + + return; +} + +int MH_SHA256_FINALIZE_FUNCTION(struct mh_sha256_ctx *ctx, void *mh_sha256_digest) +{ + uint8_t i; + uint8_t *partial_block_buffer; + uint64_t total_len; + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + + if (ctx == NULL) + return MH_SHA256_CTX_ERROR_NULL; + + total_len = 
ctx->total_length; + partial_block_buffer = ctx->partial_block_buffer; + + /* mh_sha256 tail */ + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests; + + MH_SHA256_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha256_segs_digests, + aligned_frame_buffer, ctx->mh_sha256_digest); + + /* Output the digests of mh_sha256 */ + if (mh_sha256_digest != NULL) { + for (i = 0; i < SHA256_DIGEST_WORDS; i++) + ((uint32_t *) mh_sha256_digest)[i] = ctx->mh_sha256_digest[i]; + } + + return MH_SHA256_CTX_ERROR_NONE; +} + +#ifdef MH_SHA256_FINALIZE_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver mh_sha256_finalize_base_slver_000002bb; +struct slver mh_sha256_finalize_base_slver = { 0x02bb, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h new file mode 100644 index 000000000..8051e3f36 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h @@ -0,0 +1,318 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
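One detail of the tail logic above is worth calling out: after the mandatory 0x80 byte is appended, the 8-byte bit-length only fits in the same block when at most MH_SHA256_BLOCK_SIZE - 8 bytes are occupied; otherwise a second, all-zero padding block is hashed as well. A small sketch of that rule, assuming MH_SHA256_BLOCK_SIZE is 1024 as elsewhere in this library:

    /* Number of 1KB blocks consumed by the tail step, matching the
     * partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8) test in
     * MH_SHA256_TAIL_FUNCTION() above. */
    static int mh_sha256_tail_block_count(uint32_t total_len)
    {
            uint32_t rem = total_len % MH_SHA256_BLOCK_SIZE;

            /* rem + 1 accounts for the 0x80 padding byte */
            return (rem + 1 > MH_SHA256_BLOCK_SIZE - 8) ? 2 : 1;
    }

For example, a message with total_len % 1024 == 1020 leaves no room for the length field, so two padding blocks are processed; at a remainder of 1000 bytes a single block suffices.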
+**********************************************************************/ + +#ifndef _MH_SHA256_INTERNAL_H_ +#define _MH_SHA256_INTERNAL_H_ + +/** + * @file mh_sha256_internal.h + * @brief mh_sha256 internal function prototypes and macros + * + * Interface for mh_sha256 internal functions + * + */ +#include +#include "mh_sha256.h" +#include "endian_helper.h" + +#ifdef __cplusplus + extern "C" { +#endif + +#ifdef _MSC_VER +# define inline __inline +#endif + + // 64byte pointer align +#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) ) + + /******************************************************************* + *mh_sha256 constants and macros + ******************************************************************/ + /* mh_sha256 constants */ +#define MH_SHA256_H0 0x6a09e667UL +#define MH_SHA256_H1 0xbb67ae85UL +#define MH_SHA256_H2 0x3c6ef372UL +#define MH_SHA256_H3 0xa54ff53aUL +#define MH_SHA256_H4 0x510e527fUL +#define MH_SHA256_H5 0x9b05688cUL +#define MH_SHA256_H6 0x1f83d9abUL +#define MH_SHA256_H7 0x5be0cd19UL + + /* mh_sha256 macros */ +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + + /******************************************************************* + * SHA256 API internal function prototypes + ******************************************************************/ + + /** + * @brief Performs complete SHA256 algorithm. + * + * @param input Pointer to buffer containing the input message. + * @param digest Pointer to digest to update. + * @param len Length of buffer. + * @returns None + */ + void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len); + + /** + * @brief Calculate sha256 digest of blocks which size is SHA256_BLOCK_SIZE + * + * @param data Pointer to data buffer containing the input message. + * @param digest Pointer to sha256 digest. + * @returns None + */ + void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]); + + /******************************************************************* + * mh_sha256 API internal function prototypes + * Multiple versions of Update and Finalize functions are supplied which use + * multiple versions of block and tail process subfunctions. + ******************************************************************/ + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @returns none + * + */ + void mh_sha256_tail(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. 
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_base(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires SSE + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_sse(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX2 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx2(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX512 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . 
+ * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx512(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_base(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires SSE + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX2 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX512 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. 
+ * @returns none + * + */ + void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm new file mode 100644 index 000000000..e14fc7eb1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm @@ -0,0 +1,77 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
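All of the mh_sha256_block_* variants declared above share one signature, which is what lets the dispatching layer (mh_sha256.c together with the multibinary stubs that follow) swap them freely. A rough sketch of that idea, not the library's actual dispatch code, with pick_block_fn being a hypothetical helper:

    typedef void (*mh_sha256_block_fn)(const uint8_t *input_data,
                                       uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
                                       uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE],
                                       uint32_t num_blocks);

    /* Hypothetical selection helper; the real library resolves the entry point
     * at runtime through the multibinary dispatchers below. */
    static mh_sha256_block_fn pick_block_fn(int have_avx2)
    {
            return have_avx2 ? mh_sha256_block_avx2 : mh_sha256_block_base;
    }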
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%include "reg_sizes.asm" +%include "multibinary.asm" + +%ifidn __OUTPUT_FORMAT__, elf32 + [bits 32] +%else + default rel + [bits 64] + + extern mh_sha256_update_sse + extern mh_sha256_update_avx + extern mh_sha256_update_avx2 + extern mh_sha256_finalize_sse + extern mh_sha256_finalize_avx + extern mh_sha256_finalize_avx2 + + %ifdef HAVE_AS_KNOWS_AVX512 + extern mh_sha256_update_avx512 + extern mh_sha256_finalize_avx512 + %endif + +%endif + +extern mh_sha256_update_base +extern mh_sha256_finalize_base + +mbin_interface mh_sha256_update +mbin_interface mh_sha256_finalize + +%ifidn __OUTPUT_FORMAT__, elf64 + + %ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2, mh_sha256_update_avx512 + mbin_dispatch_init6 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2, mh_sha256_finalize_avx512 + %else + mbin_dispatch_init5 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2 + mbin_dispatch_init5 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2 + %endif + +%else + mbin_dispatch_init2 mh_sha256_update, mh_sha256_update_base + mbin_dispatch_init2 mh_sha256_finalize, mh_sha256_finalize_base +%endif + +;;; func core, ver, snum +slversion mh_sha256_update, 00, 00, 02b2 +slversion mh_sha256_finalize, 00, 00, 02b3 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c new file mode 100644 index 000000000..8095e4f05 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c @@ -0,0 +1,180 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
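The mh_sha256_update and mh_sha256_finalize entry points installed by the dispatcher above are exercised by the performance harness that follows. A minimal caller, with error handling reduced to early returns and using only the context API declared in mh_sha256.h, looks roughly like this:

    #include "mh_sha256.h"

    int hash_one_buffer(const void *buf, uint32_t len,
                        uint32_t digest[SHA256_DIGEST_WORDS])
    {
            struct mh_sha256_ctx ctx;

            if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_update(&ctx, buf, len) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_finalize(&ctx, digest) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            return 0;
    }

This mirrors the init/update/finalize sequence that the perf and test programs below drive through their CHECK_RETURN() macro.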
+**********************************************************************/ + +#include +#include +#include "mh_sha256.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Loop many times over same +# define TEST_LEN 16*1024 +# define TEST_LOOPS 20000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define TEST_LEN 16*1024*1024 +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#define TEST_MEM TEST_LEN + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA256_FUNC_TYPE +#define MH_SHA256_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA256_CTX_ERROR_NONE){ \ + printf("The mh_sha256 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 32 == 0) + printf("\n"); + } + if (i % 32 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_base[SHA256_DIGEST_WORDS], + uint32_t hash_test[SHA256_DIGEST_WORDS]) +{ + int i; + int mh_sha256_fail = 0; + + for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_base[i]) + mh_sha256_fail++; + } + + if (mh_sha256_fail) { + printf("mh_sha256 fail test\n"); + printf("base: "); + dump((char *)hash_base, 32); + printf("ref: "); + dump((char *)hash_test, 32); + } + + return mh_sha256_fail; +} + +int main(int argc, char *argv[]) +{ + int i, fail = 0; + uint32_t hash_test[SHA256_DIGEST_WORDS], hash_base[SHA256_DIGEST_WORDS]; + uint8_t *buff = NULL; + struct mh_sha256_ctx *update_ctx_test = NULL, *update_ctx_base = NULL; + struct perf start, stop; + + printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n"); + + buff = malloc(TEST_LEN); + update_ctx_test = malloc(sizeof(*update_ctx_test)); + update_ctx_base = malloc(sizeof(*update_ctx_base)); + + if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + // mh_sha256 base version + mh_sha256_init(update_ctx_base); + mh_sha256_update_base(update_ctx_base, buff, TEST_LEN); + mh_sha256_finalize_base(update_ctx_base, hash_base); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS / 10; i++) { + mh_sha256_init(update_ctx_base); + mh_sha256_update_base(update_ctx_base, buff, TEST_LEN); + mh_sha256_finalize_base(update_ctx_base, hash_base); + } + perf_stop(&stop); + printf("mh_sha256_update_base" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + //Update feature test + CHECK_RETURN(mh_sha256_init(update_ctx_test)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test)); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + CHECK_RETURN(mh_sha256_init(update_ctx_test)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test)); + } + perf_stop(&stop); + printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": "); + 
perf_print(stop, start, (long long)TEST_MEM * i); + + // Check results + fail = compare_digests(hash_base, hash_test); + + if (fail) { + printf("Fail size=%d\n", TEST_LEN); + return -1; + } + + if (fail) + printf("Test failed function test%d\n", fail); + else + printf("Pass func check\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c new file mode 100644 index 000000000..2aaefecb0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c @@ -0,0 +1,410 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "mh_sha256_internal.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + // Macros and sub-functions which already exist in source code file + // (sha256_for_mh_sha256.c) is part of ISA-L library as internal functions. + // The reason why writing them twice is the linking issue caused by + // mh_sha256_ref(). mh_sha256_ref() needs these macros and sub-functions + // without linking ISA-L library. So mh_sha256_ref() includes them in + // order to contain essential sub-functions in its own object file. 
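The reference compression below spells out all 64 rounds with step() calls that permute a..h by hand; this is equivalent to the textbook rotating-variable loop. A compact sketch of that equivalence, reusing the s0/s1/maj/ch macros from mh_sha256_internal.h and assuming the 64-entry message schedule w[] and round-constant table k[] have already been filled in:

    static void sha256_compress_loop(uint32_t digest[8], const uint32_t w[64],
                                     const uint32_t k[64])
    {
            uint32_t v[8], t1, t2;
            int i, j;

            for (i = 0; i < 8; i++)
                    v[i] = digest[i];

            for (i = 0; i < 64; i++) {
                    /* v[0..7] play the roles of a..h for this round */
                    t1 = v[7] + s1(v[4]) + ch(v[4], v[5], v[6]) + k[i] + w[i];
                    t2 = s0(v[0]) + maj(v[0], v[1], v[2]);
                    for (j = 7; j > 0; j--)      /* h<-g, g<-f, ..., b<-a */
                            v[j] = v[j - 1];
                    v[4] += t1;                  /* e = d + t1 */
                    v[0] = t1 + t2;              /* a = t1 + t2 */
            }

            for (i = 0; i < 8; i++)
                    digest[i] += v[i];
    }

The unrolled form in the reference avoids the register shuffle entirely, which is why each step() invocation passes the working variables in a rotated order.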
+//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +void sha256_single_for_mh_sha256_ref(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 
0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +void sha256_for_mh_sha256_ref(const uint8_t * input_data, uint32_t * digest, + const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + + digest[0] = MH_SHA256_H0; + digest[1] = MH_SHA256_H1; + digest[2] = MH_SHA256_H2; + digest[3] = MH_SHA256_H3; + digest[4] = MH_SHA256_H4; + digest[5] = MH_SHA256_H5; + digest[6] = MH_SHA256_H6; + digest[7] = MH_SHA256_H7; + + i = len; + while (i >= SHA256_BLOCK_SIZE) { + sha256_single_for_mh_sha256_ref(input_data, digest); + input_data += SHA256_BLOCK_SIZE; + i -= SHA256_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - 8) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha256_single_for_mh_sha256_ref(buf, digest); + if (i == (2 * SHA256_BLOCK_SIZE)) + sha256_single_for_mh_sha256_ref(buf + SHA256_BLOCK_SIZE, digest); +} + +/* + * buffer to rearrange one segment data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * w0 | w1 | ... | w15 + * + */ +static inline void transform_input_single(uint32_t * new_data, uint32_t * input, + uint32_t segment) +{ + new_data[16 * segment + 0] = input[16 * 0 + segment]; + new_data[16 * segment + 1] = input[16 * 1 + segment]; + new_data[16 * segment + 2] = input[16 * 2 + segment]; + new_data[16 * segment + 3] = input[16 * 3 + segment]; + new_data[16 * segment + 4] = input[16 * 4 + segment]; + new_data[16 * segment + 5] = input[16 * 5 + segment]; + new_data[16 * segment + 6] = input[16 * 6 + segment]; + new_data[16 * segment + 7] = input[16 * 7 + segment]; + new_data[16 * segment + 8] = input[16 * 8 + segment]; + new_data[16 * segment + 9] = input[16 * 9 + segment]; + new_data[16 * segment + 10] = input[16 * 10 + segment]; + new_data[16 * segment + 11] = input[16 * 11 + segment]; + new_data[16 * segment + 12] = input[16 * 12 + segment]; + new_data[16 * segment + 13] = input[16 * 13 + segment]; + new_data[16 * segment + 14] = input[16 * 14 + segment]; + new_data[16 * segment + 15] = input[16 * 15 + segment]; +} + +// Adapt parameters to sha256_single_for_mh_sha256_ref +#define sha256_update_one_seg(data, digest) \ + sha256_single_for_mh_sha256_ref((const uint8_t *)(data), (uint32_t *)(digest)) + +/* + * buffer to Rearrange all segments data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * seg0: | w0 | w1 | ... | w15 + * seg1: | w0 | w1 | ... | w15 + * seg2: | w0 | w1 | ... | w15 + * .... + * seg15: | w0 | w1 | ... 
| w15 + * + */ +static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block) +{ + uint32_t *current_input = input + block * MH_SHA256_BLOCK_SIZE / 4; + + transform_input_single(new_data, current_input, 0); + transform_input_single(new_data, current_input, 1); + transform_input_single(new_data, current_input, 2); + transform_input_single(new_data, current_input, 3); + transform_input_single(new_data, current_input, 4); + transform_input_single(new_data, current_input, 5); + transform_input_single(new_data, current_input, 6); + transform_input_single(new_data, current_input, 7); + transform_input_single(new_data, current_input, 8); + transform_input_single(new_data, current_input, 9); + transform_input_single(new_data, current_input, 10); + transform_input_single(new_data, current_input, 11); + transform_input_single(new_data, current_input, 12); + transform_input_single(new_data, current_input, 13); + transform_input_single(new_data, current_input, 14); + transform_input_single(new_data, current_input, 15); + +} + +/* + * buffer to Calculate all segments' digests from one block. + * + * Layout of seg_digest: + * segment + * ------------------------- + * seg0: | H0 | H1 | ... | H7 + * seg1: | H0 | H1 | ... | H7 + * seg2: | H0 | H1 | ... | H7 + * .... + * seg15: | H0 | H1 | ... | H7 + * + */ +static inline void sha256_update_all_segs(uint32_t * new_data, uint32_t(*mh_sha256_seg_digests) + [SHA256_DIGEST_WORDS]) +{ + sha256_update_one_seg(&(new_data)[16 * 0], mh_sha256_seg_digests[0]); + sha256_update_one_seg(&(new_data)[16 * 1], mh_sha256_seg_digests[1]); + sha256_update_one_seg(&(new_data)[16 * 2], mh_sha256_seg_digests[2]); + sha256_update_one_seg(&(new_data)[16 * 3], mh_sha256_seg_digests[3]); + sha256_update_one_seg(&(new_data)[16 * 4], mh_sha256_seg_digests[4]); + sha256_update_one_seg(&(new_data)[16 * 5], mh_sha256_seg_digests[5]); + sha256_update_one_seg(&(new_data)[16 * 6], mh_sha256_seg_digests[6]); + sha256_update_one_seg(&(new_data)[16 * 7], mh_sha256_seg_digests[7]); + sha256_update_one_seg(&(new_data)[16 * 8], mh_sha256_seg_digests[8]); + sha256_update_one_seg(&(new_data)[16 * 9], mh_sha256_seg_digests[9]); + sha256_update_one_seg(&(new_data)[16 * 10], mh_sha256_seg_digests[10]); + sha256_update_one_seg(&(new_data)[16 * 11], mh_sha256_seg_digests[11]); + sha256_update_one_seg(&(new_data)[16 * 12], mh_sha256_seg_digests[12]); + sha256_update_one_seg(&(new_data)[16 * 13], mh_sha256_seg_digests[13]); + sha256_update_one_seg(&(new_data)[16 * 14], mh_sha256_seg_digests[14]); + sha256_update_one_seg(&(new_data)[16 * 15], mh_sha256_seg_digests[15]); +} + +void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks) +{ + uint32_t i, j; + uint32_t *temp_buffer = (uint32_t *) frame_buffer; + uint32_t(*trans_digests)[SHA256_DIGEST_WORDS]; + + trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests; + + // Re-structure seg_digests from 5*16 to 16*5 + for (j = 0; j < HASH_SEGS; j++) { + for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j]; + } + } + memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS); + + // Calculate digests for all segments, leveraging sha256 API + for (i = 0; i < num_blocks; i++) { + transform_input(temp_buffer, (uint32_t *) input_data, i); + sha256_update_all_segs(temp_buffer, trans_digests); + } + + // Re-structure seg_digests from 16*5 to 5*16 + for (j = 0; j < HASH_SEGS; j++) { + 
for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i]; + } + } + memcpy(digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS); + + return; +} + +void mh_sha256_tail_ref(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], uint8_t * frame_buffer, + uint32_t digests[SHA256_DIGEST_WORDS]) +{ + uint64_t partial_buffer_len, len_in_bit; + + partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE; + + // Padding the first block + partial_buffer[partial_buffer_len] = 0x80; + partial_buffer_len++; + memset(partial_buffer + partial_buffer_len, 0, + MH_SHA256_BLOCK_SIZE - partial_buffer_len); + + // Calculate the first block without total_length if padding needs 2 block + if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) { + mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1); + //Padding the second block + memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE); + } + //Padding the block + len_in_bit = to_be64((uint64_t) total_len * 8); + *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit; + mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1); + + //Calculate multi-hash SHA256 digests (segment digests as input message) + sha256_for_mh_sha256_ref((uint8_t *) mh_sha256_segs_digests, digests, + 4 * SHA256_DIGEST_WORDS * HASH_SEGS); + + return; +} + +void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest) +{ + uint64_t total_len; + uint64_t num_blocks; + uint32_t mh_sha256_segs_digests[SHA256_DIGEST_WORDS][HASH_SEGS]; + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE]; + uint8_t partial_block_buffer[MH_SHA256_BLOCK_SIZE * 2]; + uint32_t mh_sha256_hash_dword[SHA256_DIGEST_WORDS]; + uint32_t i; + const uint8_t *input_data = (const uint8_t *)buffer; + + /* Initialize digests of all segments */ + for (i = 0; i < HASH_SEGS; i++) { + mh_sha256_segs_digests[0][i] = MH_SHA256_H0; + mh_sha256_segs_digests[1][i] = MH_SHA256_H1; + mh_sha256_segs_digests[2][i] = MH_SHA256_H2; + mh_sha256_segs_digests[3][i] = MH_SHA256_H3; + mh_sha256_segs_digests[4][i] = MH_SHA256_H4; + mh_sha256_segs_digests[5][i] = MH_SHA256_H5; + mh_sha256_segs_digests[6][i] = MH_SHA256_H6; + mh_sha256_segs_digests[7][i] = MH_SHA256_H7; + } + + total_len = len; + + // Calculate blocks + num_blocks = len / MH_SHA256_BLOCK_SIZE; + if (num_blocks > 0) { + //do num_blocks process + mh_sha256_block_ref(input_data, mh_sha256_segs_digests, frame_buffer, + num_blocks); + len -= num_blocks * MH_SHA256_BLOCK_SIZE; + input_data += num_blocks * MH_SHA256_BLOCK_SIZE; + } + // Store the partial block + if (len != 0) { + memcpy(partial_block_buffer, input_data, len); + } + + /* Finalize */ + mh_sha256_tail_ref(partial_block_buffer, total_len, mh_sha256_segs_digests, + frame_buffer, mh_sha256_hash_dword); + + // Output the digests of mh_sha256 + if (mh_sha256_digest != NULL) { + mh_sha256_digest[0] = mh_sha256_hash_dword[0]; + mh_sha256_digest[1] = mh_sha256_hash_dword[1]; + mh_sha256_digest[2] = mh_sha256_hash_dword[2]; + mh_sha256_digest[3] = mh_sha256_hash_dword[3]; + mh_sha256_digest[4] = mh_sha256_hash_dword[4]; + mh_sha256_digest[5] = mh_sha256_hash_dword[5]; + mh_sha256_digest[6] = mh_sha256_hash_dword[6]; + mh_sha256_digest[7] = mh_sha256_hash_dword[7]; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c new file mode 100644 index 000000000..13ab91c16 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c @@ -0,0 +1,217 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "mh_sha256.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA256_FUNC_TYPE +#define MH_SHA256_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA256_CTX_ERROR_NONE){ \ + printf("The mh_sha256 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest); +#define MH_SHA256_REF mh_sha256_ref + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 32 == 0) + printf("\n"); + } + if (i % 32 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS], + uint32_t hash_test[SHA256_DIGEST_WORDS]) +{ + int i; + int mh_sha256_fail = 0; + + for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_ref[i]) + mh_sha256_fail++; + } + + if (mh_sha256_fail) { + printf("mh_sha256 fail test\n"); + printf("ref: "); + dump((char *)hash_ref, 32); + printf("test: "); + dump((char *)hash_test, 32); + } + + return mh_sha256_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0; + uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS]; + uint8_t *buff = NULL; + int 
size, offset; + struct mh_sha256_ctx *update_ctx = NULL; + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + MH_SHA256_REF(buff, TEST_LEN, hash_ref); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages + for (size = TEST_LEN; size >= 0; size--) { + + // Fill with rand data + rand_buffer(buff, size); + + MH_SHA256_REF(buff, size, hash_ref); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various buffer offsets and sizes + printf("offset tests"); + for (size = TEST_LEN - 256; size > 256; size -= 11) { + for (offset = 0; offset < 256; offset++) { + MH_SHA256_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + } + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + // Run efence tests + printf("efence tests"); + for (size = TEST_SIZE; size > 0; size--) { + offset = TEST_LEN - size; + + MH_SHA256_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:"); + printf(" %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c new file mode 100644 index 000000000..024ae2b91 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c @@ -0,0 +1,110 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * mh_sha256_update_base.c contains the prototype of mh_sha256_update_XXX. + * Default definitions are base type which generates mh_sha256_update_base. + * Other types are generated through different predefined macros by mh_sha256.c. + */ +#ifndef MH_SHA256_UPDATE_FUNCTION +#include "mh_sha256_internal.h" +#include + +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_base +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base +#define MH_SHA256_UPDATE_SLVER +#endif + +int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len) +{ + + uint8_t *partial_block_buffer; + uint64_t partial_block_len; + uint64_t num_blocks; + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + const uint8_t *input_data = (const uint8_t *)buffer; + + if (ctx == NULL) + return MH_SHA256_CTX_ERROR_NULL; + + if (len == 0) + return MH_SHA256_CTX_ERROR_NONE; + + partial_block_len = ctx->total_length % MH_SHA256_BLOCK_SIZE; + partial_block_buffer = ctx->partial_block_buffer; + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests; + + ctx->total_length += len; + // No enough input data for mh_sha256 calculation + if (len + partial_block_len < MH_SHA256_BLOCK_SIZE) { + memcpy(partial_block_buffer + partial_block_len, input_data, len); + return MH_SHA256_CTX_ERROR_NONE; + } + // mh_sha256 calculation for the previous partial block + if (partial_block_len != 0) { + memcpy(partial_block_buffer + partial_block_len, input_data, + MH_SHA256_BLOCK_SIZE - partial_block_len); + //do one_block process + MH_SHA256_BLOCK_FUNCTION(partial_block_buffer, mh_sha256_segs_digests, + aligned_frame_buffer, 1); + input_data += MH_SHA256_BLOCK_SIZE - partial_block_len; + len -= MH_SHA256_BLOCK_SIZE - partial_block_len; + memset(partial_block_buffer, 0, MH_SHA256_BLOCK_SIZE); + } + // Calculate mh_sha256 for the current blocks + num_blocks = len / MH_SHA256_BLOCK_SIZE; + if (num_blocks > 0) { + //do num_blocks process + MH_SHA256_BLOCK_FUNCTION(input_data, mh_sha256_segs_digests, + aligned_frame_buffer, num_blocks); + len -= num_blocks * MH_SHA256_BLOCK_SIZE; + input_data += num_blocks * MH_SHA256_BLOCK_SIZE; + } + // Store the partial block + if (len != 0) { + memcpy(partial_block_buffer, input_data, len); + } + + return MH_SHA256_CTX_ERROR_NONE; + +} + +#ifdef MH_SHA256_UPDATE_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// 
Version info +struct slver mh_sha256_update_base_slver_000002ba; +struct slver mh_sha256_update_base_slver = { 0x02ba, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c new file mode 100644 index 000000000..f5b28bba7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c @@ -0,0 +1,240 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "mh_sha256.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA256_FUNC_TYPE +#define MH_SHA256_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA256_CTX_ERROR_NONE){ \ + printf("The mh_sha256 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest); + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 20 == 0) + printf("\n"); + } + if (i % 20 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS], + uint32_t hash_test[SHA256_DIGEST_WORDS]) +{ + int i; + int mh_sha256_fail = 0; + + for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_ref[i]) + mh_sha256_fail++; + } + + if (mh_sha256_fail) { + printf("mh_sha256 fail test\n"); + printf("ref: "); + dump((char *)hash_ref, 20); + printf("test: "); + dump((char *)hash_test, 20); + } + + return mh_sha256_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0, i; + uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS]; + uint8_t *buff = NULL; + int update_count; + int size1, size2, offset, addr_offset; + struct mh_sha256_ctx *update_ctx = NULL; + uint8_t *mem_addr = NULL; + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages by update twice. 
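+ /*
+  * Property exercised below: splitting the input across several update
+  * calls must not change the result, i.e.
+  *     mh_sha256_init(update_ctx);
+  *     TEST_UPDATE_FUNCTION(update_ctx, buff, size1);
+  *     TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2);
+  *     TEST_FINAL_FUNCTION(update_ctx, hash_test);
+  * must produce the same digest as mh_sha256_ref() over all TEST_LEN bytes.
+  */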
+ printf("\n various size messages by update twice tests"); + for (size1 = TEST_LEN; size1 >= 0; size1--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size2 = TEST_LEN - size1; // size2 is different with the former + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various update count + printf("\n various update count tests"); + for (update_count = 1; update_count <= TEST_LEN; update_count++) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size1 = TEST_LEN / update_count; + size2 = TEST_LEN - size1 * (update_count - 1); // size2 is different with the former + + CHECK_RETURN(mh_sha256_init(update_ctx)); + for (i = 1, offset = 0; i < update_count; i++) { + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1)); + offset += size1; + } + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // test various start address of ctx. + printf("\n various start address of ctx test"); + free(update_ctx); + mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10); + for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // a unaligned offset + update_ctx = (struct mh_sha256_ctx *)(mem_addr + addr_offset); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail addr_offset=%d\n", addr_offset); + return -1; + } + + if ((addr_offset & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c new file mode 100644 index 000000000..ea8c9f436 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c @@ -0,0 +1,176 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "mh_sha256_internal.h" +#include + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA256 Functions for mh_sha256 +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + 
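/* rounds 32-63 continue below with the remaining standard SHA-256 K constants; the a..h arguments rotate one position in each successive step() call, which takes the place of the usual end-of-round variable shuffle */ +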
step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + + digest[0] = MH_SHA256_H0; + digest[1] = MH_SHA256_H1; + digest[2] = MH_SHA256_H2; + digest[3] = MH_SHA256_H3; + digest[4] = MH_SHA256_H4; + digest[5] = MH_SHA256_H5; + digest[6] = MH_SHA256_H6; + digest[7] = MH_SHA256_H7; + + i = len; + while (i >= SHA256_BLOCK_SIZE) { + sha256_single_for_mh_sha256(input_data, digest); + input_data += SHA256_BLOCK_SIZE; + i -= SHA256_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - 8) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha256_single_for_mh_sha256(buf, digest); + if (i == (2 * SHA256_BLOCK_SIZE)) + sha256_single_for_mh_sha256(buf + SHA256_BLOCK_SIZE, digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am new file mode 100644 index 000000000..a16209248 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am @@ -0,0 +1,57 @@ +######################################################################## +# Copyright(c) 2011-2017 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_x86_64 += rolling_hash/rolling_hashx_base.c +lsrc_x86_64 += rolling_hash/rolling_hash2.c +lsrc_x86_64 += rolling_hash/rolling_hash2_until_04.asm +lsrc_x86_64 += rolling_hash/rolling_hash2_until_00.asm +lsrc_x86_64 += rolling_hash/rolling_hash2_multibinary.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_base_aliases += rolling_hash/rolling_hashx_base.c \ + rolling_hash/rolling_hash2.c \ + rolling_hash/rolling_hash2_base_aliases.c + + +lsrc_aarch64 += rolling_hash/rolling_hashx_base.c \ + rolling_hash/rolling_hash2.c \ + rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S \ + rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c \ + rolling_hash/aarch64/rolling_hash2_run_until_unroll.S + +src_include += -I $(srcdir)/rolling_hash +extern_hdrs += include/rolling_hashx.h + +other_src += rolling_hash/rolling_hash2_table.h +other_src += include/test.h include/types.h + +check_tests += rolling_hash/rolling_hash2_test +perf_tests += rolling_hash/rolling_hash2_perf +other_tests += rolling_hash/chunking_with_mb_hash diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c new file mode 100644 index 000000000..98692e162 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c @@ -0,0 +1,37 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(rolling_hash2_run_until) +{ + return PROVIDER_INFO(rolling_hash2_run_until_unroll); + + //~ return PROVIDER_BASIC(rolling_hash2_run_until); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S new file mode 100644 index 000000000..efbe44a18 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface rolling_hash2_run_until + diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S new file mode 100644 index 000000000..7ba04efbd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S @@ -0,0 +1,115 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 +/* + uint64_t rolling_hash2_run_until_unroll( + uint32_t * idx, int max_idx, uint64_t * t1, + uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h, + uint64_t mask, uint64_t trigger) +*/ + idx_addr .req x0 + max_idx .req w1 //signed int + t1_addr .req x2 + t2_addr .req x3 + b1_addr .req x4 + b2_addr .req x5 + h .req x6 + mask .req x7 + trigger .req x12 + + idx .req w8 + + dat1 .req x9 + dat2 .req x10 + wdat1 .req w9 + wdat2 .req w10 + tmp_loop .req w11 + + t1 .req x13 + t2 .req x14 + +.macro round off:req + ubfx t1,dat1,8*\off\(),8 + ubfx t2,dat2,8*\off\(),8 + ldr t1,[t1_addr,t1,lsl 3] + ldr t2,[t2_addr,t2,lsl 3] + eor t1,t2,t1 + eor h,t1,h,ror 63 + and t2,h,mask + cmp t2,trigger + beq exit_ret + add idx, idx,1 +.endm + .global rolling_hash2_run_until_unroll + .type rolling_hash2_run_until_unroll, %function +rolling_hash2_run_until_unroll: + ldr trigger,[sp] + ldr idx,[idx_addr] + sub tmp_loop,max_idx,8 + cmp idx,tmp_loop + bge unroll_loop_end +unroll_loop: + ldr dat1,[b1_addr,idx,sxtw] + ldr dat2,[b2_addr,idx,sxtw] + + round 0 + round 1 + round 2 + round 3 + round 4 + round 5 + round 6 + round 7 + cmp tmp_loop,idx + bgt unroll_loop +unroll_loop_end: + cmp idx,max_idx + bge exit_ret +loop: + ldrb wdat1,[b1_addr,idx,sxtw] + ldrb wdat2,[b2_addr,idx,sxtw] + ldr t1,[t1_addr,dat1,lsl 3] + ldr t2,[t2_addr,dat2,lsl 3] + eor t1,t2,t1 + eor h,t1,h,ror 63 + and t2,h,mask + cmp t2,trigger + beq exit_ret + add idx,idx,1 + cmp max_idx,idx + bgt loop +exit_ret: + str idx,[idx_addr] + mov x0,h + ret + .size rolling_hash2_run_until_unroll, .-rolling_hash2_run_until_unroll + diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c new file mode 100644 index 000000000..23062c3ef --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c @@ -0,0 +1,222 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include +#include +#include "rolling_hashx.h" +#include "sha256_mb.h" +#include "test.h" + +#define MAX_BUFFER_SIZE 128*1024*1024 +#define HASH_POOL_SIZE SHA256_MAX_LANES + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define FILTER_BITS 10 +#define FILTER_SIZE (1 << FILTER_BITS) +#define FILTER_MASK (FILTER_SIZE - 1) + +#define BITS_TO_INDEX_LONG 6 +#define MASK_TO_INDEX_LONG ((1 << BITS_TO_INDEX_LONG) - 1) + +// Globals +SHA256_HASH_CTX ctxpool[SHA256_MAX_LANES], *last_ctx; +SHA256_HASH_CTX_MGR mb_hash_mgr; +uint64_t filter_table[FILTER_SIZE]; +unsigned long chunks_created = 0; +unsigned long filter_hits = 0; + +// Example function to run on each chunk + +void run_fragment(SHA256_HASH_CTX * ctx) +{ + uint64_t lookup, set_hash; + unsigned int lookup_hash; + uint32_t idx; + + chunks_created++; + + // Run a simple lookup filter on chunk using digest + lookup_hash = ctx->job.result_digest[0] & FILTER_MASK; + lookup = filter_table[lookup_hash]; + + idx = ctx->job.result_digest[1]; + + set_hash = 1 << (idx & MASK_TO_INDEX_LONG) | + 1 << ((idx >> BITS_TO_INDEX_LONG) & MASK_TO_INDEX_LONG) | + 1 << ((idx >> (2 * BITS_TO_INDEX_LONG)) & MASK_TO_INDEX_LONG); + + if ((lookup & set_hash) == set_hash) + filter_hits++; + else + filter_table[lookup_hash] = lookup | set_hash; +} + +void setup_chunk_processing(void) +{ + int i; + + sha256_ctx_mgr_init(&mb_hash_mgr); + + for (i = 0; i < HASH_POOL_SIZE; i++) + hash_ctx_init(&ctxpool[i]); + + last_ctx = &ctxpool[0]; +} + +SHA256_HASH_CTX *get_next_job_ctx(void) +{ + int i; + SHA256_HASH_CTX *ctx; + + if (last_ctx && hash_ctx_complete(last_ctx)) + return last_ctx; + + for (i = 0; i < HASH_POOL_SIZE; i++) { + if (hash_ctx_complete(&ctxpool[i])) + return &ctxpool[i]; + } + ctx = sha256_ctx_mgr_flush(&mb_hash_mgr); + assert(ctx != NULL); + return ctx; +} + +void put_next_job_ctx(SHA256_HASH_CTX * ctx) +{ + if (ctx && hash_ctx_complete(ctx)) + last_ctx = ctx; + + run_fragment(ctx); +} + +void process_chunk(uint8_t * buff, int len) +{ + SHA256_HASH_CTX *ctx; + + ctx = get_next_job_ctx(); + ctx = sha256_ctx_mgr_submit(&mb_hash_mgr, ctx, buff, len, HASH_ENTIRE); + + if (ctx) + put_next_job_ctx(ctx); +} + +void finish_chunk_processing(void) +{ + SHA256_HASH_CTX *ctx; + + while ((ctx = sha256_ctx_mgr_flush(&mb_hash_mgr)) != NULL) + run_fragment(ctx); +} + +int main(void) +{ + int i, w; + uint8_t *buffer, *p; + uint32_t mask, trigger, offset = 0; + uint32_t min_chunk, max_chunk, mean_chunk; + long remain; + struct rh_state2 state; + struct perf start, stop; + + // Chunking parameters + w = 32; + min_chunk = 1024; + mean_chunk = 4 * 1024; + max_chunk = 32 * 1024; + mask = rolling_hashx_mask_gen(mean_chunk, 0); + trigger = rand() & mask; + + printf("chunk and hash test w=%d, min=%d, target_ave=%d, max=%d:\n", w, min_chunk, + mean_chunk, max_chunk); + + if (min_chunk < w || min_chunk > max_chunk) { + printf(" Improper parameters 
selected\n"); + return -1; + } + + if ((buffer = malloc(MAX_BUFFER_SIZE)) == NULL) { + printf("cannot allocate mem\n"); + return -1; + } + // Initialize buffer with random data + srand(TEST_SEED); + for (i = 0; i < MAX_BUFFER_SIZE; i++) + buffer[i] = rand(); + + // Start chunking test with multi-buffer hashing of results + perf_start(&start); + + rolling_hash2_init(&state, w); + setup_chunk_processing(); + + p = buffer; + remain = MAX_BUFFER_SIZE; + + while (remain > max_chunk) { + // Skip to min chunk + rolling_hash2_reset(&state, p + min_chunk - w); + rolling_hash2_run(&state, p + min_chunk, max_chunk - min_chunk, + mask, trigger, &offset); + + process_chunk(p, min_chunk + offset); + + p += offset + min_chunk; + remain -= (offset + min_chunk); + } + + while (remain > min_chunk) { + rolling_hash2_reset(&state, p + min_chunk - w); + rolling_hash2_run(&state, p + min_chunk, remain - min_chunk, + mask, trigger, &offset); + + process_chunk(p, min_chunk + offset); + + p += offset + min_chunk; + remain -= (offset + min_chunk); + } + + if (remain > 0) + process_chunk(p, remain); + + finish_chunk_processing(); + perf_stop(&stop); + + printf("chunking_with_mb_hash: "); + perf_print(stop, start, MAX_BUFFER_SIZE); + + printf(" found %ld chunks, ave_len=%ld, filter hits=%ld\n", chunks_created, + MAX_BUFFER_SIZE / chunks_created, filter_hits); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c new file mode 100644 index 000000000..4b066e40f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c @@ -0,0 +1,169 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "rolling_hashx.h" +#include "rolling_hash2_table.h" + +extern +uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1, + uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h, + uint64_t mask, uint64_t trigger); + +int rolling_hash2_init(struct rh_state2 *state, uint32_t w) +{ + uint32_t i; + uint64_t v; + + if (w > FINGERPRINT_MAX_WINDOW) + return -1; + + for (i = 0; i < 256; i++) { + v = rolling_hash2_table1[i]; + state->table1[i] = v; + state->table2[i] = (v << w) | (v >> (64 - w)); + } + state->w = w; + return 0; +} + +void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes) +{ + uint64_t hash; + uint32_t i, w; + + hash = 0; + w = state->w; + for (i = 0; i < w; i++) { + hash = (hash << 1) | (hash >> (64 - 1)); + hash ^= state->table1[init_bytes[i]]; + } + state->hash = hash; + memcpy(state->history, init_bytes, w); +} + +static +uint64_t hash_fn(struct rh_state2 *state, uint64_t h, uint8_t new_char, uint8_t old_char) +{ + h = (h << 1) | (h >> (64 - 1)); + h ^= state->table1[new_char] ^ state->table2[old_char]; + return h; +} + +uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1, + uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h, + uint64_t mask, uint64_t trigger) +{ + int i = *idx; + + if (trigger == 0) { + for (; i < max_idx; i++) { + h = (h << 1) | (h >> (64 - 1)); + h ^= t1[b1[i]] ^ t2[b2[i]]; + if ((h & mask) == 0) { + *idx = i; + return h; + } + } + } else { + for (; i < max_idx; i++) { + h = (h << 1) | (h >> (64 - 1)); + h ^= t1[b1[i]] ^ t2[b2[i]]; + if ((h & mask) == trigger) { + *idx = i; + return h; + } + } + } + *idx = i; + return h; +} + +int +rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t buffer_length, + uint32_t mask, uint32_t trigger, uint32_t * offset) +{ + + uint32_t i; + uint32_t w = state->w; + uint64_t hash = state->hash; + + for (i = 0; i < w; i++) { + if (i == buffer_length) { + *offset = i; + // update history + memmove(state->history, state->history + i, w - i); + memcpy(state->history + w - i, buffer, i); + state->hash = hash; + return FINGERPRINT_RET_MAX; + } + hash = hash_fn(state, hash, buffer[i], state->history[i]); + + if ((hash & mask) == trigger) { + // found hit + i++; + *offset = i; + memmove(state->history, state->history + i, w - i); + memcpy(state->history + w - i, buffer, i); + state->hash = hash; + return FINGERPRINT_RET_HIT; + } + } + + hash = rolling_hash2_run_until(&i, buffer_length, state->table1, state->table2, + buffer, buffer - w, hash, mask, trigger); + if ((hash & mask) == trigger) { + // found hit + i++; + *offset = i; + memcpy(state->history, buffer + i - w, w); + state->hash = hash; + return FINGERPRINT_RET_HIT; + } + // no hit + *offset = i; + memcpy(state->history, buffer + i - w, w); + state->hash = hash; + return FINGERPRINT_RET_MAX; +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver rolling_hash2_init_slver_00000264; +struct slver rolling_hash2_init_slver = { 0x0264, 0x00, 0x00 }; + +struct slver rolling_hash2_reset_slver_00000265; +struct slver rolling_hash2_reset_slver = { 0x0265, 0x00, 0x00 }; + +struct slver rolling_hash2_run_slver_00000266; +struct slver rolling_hash2_run_slver = { 0x0266, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c new file mode 100644 index 
000000000..58ee50a92 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c @@ -0,0 +1,39 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1, + uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h, + uint64_t mask, uint64_t trigger); +uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1, + uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h, + uint64_t mask, uint64_t trigger) +{ + return rolling_hash2_run_until_base(idx, max_idx, t1, t2, b1, b2, h, mask, trigger); +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm new file mode 100644 index 000000000..ad62dad74 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm @@ -0,0 +1,122 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf32 + +[bits 32] +%define def_wrd dd +%define wrd_sz dword +%define arg1 esi + +%else + +default rel +[bits 64] +%define def_wrd dq +%define wrd_sz qword +%define arg1 rsi + +extern rolling_hash2_run_until_00 +extern rolling_hash2_run_until_04 +%endif + +extern rolling_hash2_run_until_base + + +section .data +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +rolling_hash2_run_until_dispatched: + def_wrd rolling_hash2_run_until_mbinit + +section .text + +;;;; +; rolling_hash2_run_until multibinary function +;;;; +mk_global rolling_hash2_run_until, function +rolling_hash2_run_until_mbinit: + endbranch + call rolling_hash2_run_until_dispatch_init + +rolling_hash2_run_until: + jmp wrd_sz [rolling_hash2_run_until_dispatched] + +rolling_hash2_run_until_dispatch_init: + push arg1 +%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check + lea arg1, [rolling_hash2_run_until_base] +%else + push rax + push rbx + push rcx + push rdx + lea arg1, [rolling_hash2_run_until_base WRT_OPT] ; Default + + mov eax, 1 + cpuid + lea rbx, [rolling_hash2_run_until_00 WRT_OPT] + test ecx, FLAG_CPUID1_ECX_SSE4_1 + cmovne arg1, rbx + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea rbx, [rolling_hash2_run_until_00 WRT_OPT] + + jne _done_rolling_hash2_run_until_data_init + mov rsi, rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID1_EBX_AVX2 + lea rbx, [rolling_hash2_run_until_04 WRT_OPT] + cmovne rsi, rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _done_rolling_hash2_run_until_data_init + lea rsi, [rolling_hash2_run_until_00 WRT_OPT] + +_done_rolling_hash2_run_until_data_init: + pop rdx + pop rcx + pop rbx + pop rax +%endif ;; END 32-bit check + mov [rolling_hash2_run_until_dispatched], arg1 + pop arg1 + ret diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c new file mode 100644 index 000000000..da0e0fba7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c @@ -0,0 +1,120 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include +#include "rolling_hashx.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 100000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#ifndef FUT_run +# define FUT_run rolling_hash2_run +#endif +#ifndef FUT_init +# define FUT_init rolling_hash2_init +#endif +#ifndef FUT_reset +# define FUT_reset rolling_hash2_reset +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#ifndef TEST_SEED +# define TEST_SEED 0x123f +#endif + +#define TEST_MEM TEST_LEN + +int main(int argc, char *argv[]) +{ + uint8_t *buf; + uint32_t mask, trigger, offset = 0; + int i, w, ret; + long long run_length; + struct rh_state2 *state; + struct perf start, stop; + + // Case + w = 32; + mask = 0xffffffff; + trigger = 0x123; + + printf(xstr(FUT_run) "_perf:\n"); + + buf = malloc(TEST_LEN); + if (buf == NULL) { + printf("alloc error: Fail\n"); + return -1; + } + if (posix_memalign((void **)&state, 64, sizeof(struct rh_state2))) { + printf("alloc error rh_state: Fail\n");; + return -1; + } + + srand(TEST_SEED); + + for (i = 0; i < TEST_LEN; i++) + buf[i] = rand(); + + printf("Start timed tests\n"); + fflush(0); + + FUT_init(state, w); + FUT_reset(state, buf); + ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset); + } + perf_stop(&stop); + + run_length = (ret == FINGERPRINT_RET_HIT) ? 
offset : TEST_LEN; + printf(" returned %d after %lld B\n", ret, run_length); + printf(xstr(FUT_run) TEST_TYPE_STR ": "); + perf_print(stop, start, run_length * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h new file mode 100644 index 000000000..366f26374 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h @@ -0,0 +1,296 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
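The perf harness above also doubles as the simplest usage example of the public rolling_hashx API: initialise a window, prime it, then scan for a fingerprint hit. A trimmed sketch follows, assuming only the calls visible in rolling_hash2_perf.c and rolling_hash2_test.c (rolling_hash2_init/reset/run, struct rh_state2, FINGERPRINT_RET_HIT); buffer size and constants are arbitrary.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "rolling_hashx.h"

int main(void)
{
    uint32_t mask = 0xffff;    /* bits of the hash compared against the trigger */
    uint32_t trigger = 0x123;  /* boundary is declared when (hash & mask) == trigger */
    uint32_t offset = 0;
    struct rh_state2 state;
    size_t len = 1024 * 1024;
    uint8_t *buf = malloc(len);

    if (buf == NULL)
        return 1;
    for (size_t i = 0; i < len; i++)
        buf[i] = rand();

    rolling_hash2_init(&state, 32);    /* 32-byte rolling window */
    rolling_hash2_reset(&state, buf);  /* prime the window from the buffer start */
    int ret = rolling_hash2_run(&state, buf, len, mask, trigger, &offset);

    if (ret == FINGERPRINT_RET_HIT)
        printf("chunk boundary after %u bytes\n", offset);
    else
        printf("no boundary in %zu bytes (ret=%d)\n", len, ret);

    free(buf);
    return 0;
}

Masks with more bits set make hits rarer and chunks longer; rolling_hashx_mask_gen() in rolling_hashx_base.c builds a mask whose popcount roughly corresponds to a requested mean chunk size.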
+**********************************************************************/ + +#ifndef _ROLLING_HASH2_TABLE_H_ +#define _ROLLING_HASH2_TABLE_H_ + +// values are fractional part of pi +// taken from: +// http://www.herongyang.com/Cryptography/Blowfish-First-8366-Hex-Digits-of-PI.html +// taken from source code of BlowfishJ + +uint64_t rolling_hash2_table1[256] = { + 0x243F6A8885A308D3, + 0x13198A2E03707344, + 0xA4093822299F31D0, + 0x082EFA98EC4E6C89, + 0x452821E638D01377, + 0xBE5466CF34E90C6C, + 0xC0AC29B7C97C50DD, + 0x3F84D5B5B5470917, + 0x9216D5D98979FB1B, + 0xD1310BA698DFB5AC, + 0x2FFD72DBD01ADFB7, + 0xB8E1AFED6A267E96, + 0xBA7C9045F12C7F99, + 0x24A19947B3916CF7, + 0x0801F2E2858EFC16, + 0x636920D871574E69, + 0xA458FEA3F4933D7E, + 0x0D95748F728EB658, + 0x718BCD5882154AEE, + 0x7B54A41DC25A59B5, + 0x9C30D5392AF26013, + 0xC5D1B023286085F0, + 0xCA417918B8DB38EF, + 0x8E79DCB0603A180E, + 0x6C9E0E8BB01E8A3E, + 0xD71577C1BD314B27, + 0x78AF2FDA55605C60, + 0xE65525F3AA55AB94, + 0x5748986263E81440, + 0x55CA396A2AAB10B6, + 0xB4CC5C341141E8CE, + 0xA15486AF7C72E993, + 0xB3EE1411636FBC2A, + 0x2BA9C55D741831F6, + 0xCE5C3E169B87931E, + 0xAFD6BA336C24CF5C, + 0x7A32538128958677, + 0x3B8F48986B4BB9AF, + 0xC4BFE81B66282193, + 0x61D809CCFB21A991, + 0x487CAC605DEC8032, + 0xEF845D5DE98575B1, + 0xDC262302EB651B88, + 0x23893E81D396ACC5, + 0x0F6D6FF383F44239, + 0x2E0B4482A4842004, + 0x69C8F04A9E1F9B5E, + 0x21C66842F6E96C9A, + 0x670C9C61ABD388F0, + 0x6A51A0D2D8542F68, + 0x960FA728AB5133A3, + 0x6EEF0B6C137A3BE4, + 0xBA3BF0507EFB2A98, + 0xA1F1651D39AF0176, + 0x66CA593E82430E88, + 0x8CEE8619456F9FB4, + 0x7D84A5C33B8B5EBE, + 0xE06F75D885C12073, + 0x401A449F56C16AA6, + 0x4ED3AA62363F7706, + 0x1BFEDF72429B023D, + 0x37D0D724D00A1248, + 0xDB0FEAD349F1C09B, + 0x075372C980991B7B, + 0x25D479D8F6E8DEF7, + 0xE3FE501AB6794C3B, + 0x976CE0BD04C006BA, + 0xC1A94FB6409F60C4, + 0x5E5C9EC2196A2463, + 0x68FB6FAF3E6C53B5, + 0x1339B2EB3B52EC6F, + 0x6DFC511F9B30952C, + 0xCC814544AF5EBD09, + 0xBEE3D004DE334AFD, + 0x660F2807192E4BB3, + 0xC0CBA85745C8740F, + 0xD20B5F39B9D3FBDB, + 0x5579C0BD1A60320A, + 0xD6A100C6402C7279, + 0x679F25FEFB1FA3CC, + 0x8EA5E9F8DB3222F8, + 0x3C7516DFFD616B15, + 0x2F501EC8AD0552AB, + 0x323DB5FAFD238760, + 0x53317B483E00DF82, + 0x9E5C57BBCA6F8CA0, + 0x1A87562EDF1769DB, + 0xD542A8F6287EFFC3, + 0xAC6732C68C4F5573, + 0x695B27B0BBCA58C8, + 0xE1FFA35DB8F011A0, + 0x10FA3D98FD2183B8, + 0x4AFCB56C2DD1D35B, + 0x9A53E479B6F84565, + 0xD28E49BC4BFB9790, + 0xE1DDF2DAA4CB7E33, + 0x62FB1341CEE4C6E8, + 0xEF20CADA36774C01, + 0xD07E9EFE2BF11FB4, + 0x95DBDA4DAE909198, + 0xEAAD8E716B93D5A0, + 0xD08ED1D0AFC725E0, + 0x8E3C5B2F8E7594B7, + 0x8FF6E2FBF2122B64, + 0x8888B812900DF01C, + 0x4FAD5EA0688FC31C, + 0xD1CFF191B3A8C1AD, + 0x2F2F2218BE0E1777, + 0xEA752DFE8B021FA1, + 0xE5A0CC0FB56F74E8, + 0x18ACF3D6CE89E299, + 0xB4A84FE0FD13E0B7, + 0x7CC43B81D2ADA8D9, + 0x165FA26680957705, + 0x93CC7314211A1477, + 0xE6AD206577B5FA86, + 0xC75442F5FB9D35CF, + 0xEBCDAF0C7B3E89A0, + 0xD6411BD3AE1E7E49, + 0x00250E2D2071B35E, + 0x226800BB57B8E0AF, + 0x2464369BF009B91E, + 0x5563911D59DFA6AA, + 0x78C14389D95A537F, + 0x207D5BA202E5B9C5, + 0x832603766295CFA9, + 0x11C819684E734A41, + 0xB3472DCA7B14A94A, + 0x1B5100529A532915, + 0xD60F573FBC9BC6E4, + 0x2B60A47681E67400, + 0x08BA6FB5571BE91F, + 0xF296EC6B2A0DD915, + 0xB6636521E7B9F9B6, + 0xFF34052EC5855664, + 0x53B02D5DA99F8FA1, + 0x08BA47996E85076A, + 0x4B7A70E9B5B32944, + 0xDB75092EC4192623, + 0xAD6EA6B049A7DF7D, + 0x9CEE60B88FEDB266, + 0xECAA8C71699A17FF, + 0x5664526CC2B19EE1, + 0x193602A575094C29, + 0xA0591340E4183A3E, + 
0x3F54989A5B429D65, + 0x6B8FE4D699F73FD6, + 0xA1D29C07EFE830F5, + 0x4D2D38E6F0255DC1, + 0x4CDD20868470EB26, + 0x6382E9C6021ECC5E, + 0x09686B3F3EBAEFC9, + 0x3C9718146B6A70A1, + 0x687F358452A0E286, + 0xB79C5305AA500737, + 0x3E07841C7FDEAE5C, + 0x8E7D44EC5716F2B8, + 0xB03ADA37F0500C0D, + 0xF01C1F040200B3FF, + 0xAE0CF51A3CB574B2, + 0x25837A58DC0921BD, + 0xD19113F97CA92FF6, + 0x9432477322F54701, + 0x3AE5E58137C2DADC, + 0xC8B576349AF3DDA7, + 0xA94461460FD0030E, + 0xECC8C73EA4751E41, + 0xE238CD993BEA0E2F, + 0x3280BBA1183EB331, + 0x4E548B384F6DB908, + 0x6F420D03F60A04BF, + 0x2CB8129024977C79, + 0x5679B072BCAF89AF, + 0xDE9A771FD9930810, + 0xB38BAE12DCCF3F2E, + 0x5512721F2E6B7124, + 0x501ADDE69F84CD87, + 0x7A5847187408DA17, + 0xBC9F9ABCE94B7D8C, + 0xEC7AEC3ADB851DFA, + 0x63094366C464C3D2, + 0xEF1C18473215D908, + 0xDD433B3724C2BA16, + 0x12A14D432A65C451, + 0x50940002133AE4DD, + 0x71DFF89E10314E55, + 0x81AC77D65F11199B, + 0x043556F1D7A3C76B, + 0x3C11183B5924A509, + 0xF28FE6ED97F1FBFA, + 0x9EBABF2C1E153C6E, + 0x86E34570EAE96FB1, + 0x860E5E0A5A3E2AB3, + 0x771FE71C4E3D06FA, + 0x2965DCB999E71D0F, + 0x803E89D65266C825, + 0x2E4CC9789C10B36A, + 0xC6150EBA94E2EA78, + 0xA5FC3C531E0A2DF4, + 0xF2F74EA7361D2B3D, + 0x1939260F19C27960, + 0x5223A708F71312B6, + 0xEBADFE6EEAC31F66, + 0xE3BC4595A67BC883, + 0xB17F37D1018CFF28, + 0xC332DDEFBE6C5AA5, + 0x6558218568AB9802, + 0xEECEA50FDB2F953B, + 0x2AEF7DAD5B6E2F84, + 0x1521B62829076170, + 0xECDD4775619F1510, + 0x13CCA830EB61BD96, + 0x0334FE1EAA0363CF, + 0xB5735C904C70A239, + 0xD59E9E0BCBAADE14, + 0xEECC86BC60622CA7, + 0x9CAB5CABB2F3846E, + 0x648B1EAF19BDF0CA, + 0xA02369B9655ABB50, + 0x40685A323C2AB4B3, + 0x319EE9D5C021B8F7, + 0x9B540B19875FA099, + 0x95F7997E623D7DA8, + 0xF837889A97E32D77, + 0x11ED935F16681281, + 0x0E358829C7E61FD6, + 0x96DEDFA17858BA99, + 0x57F584A51B227263, + 0x9B83C3FF1AC24696, + 0xCDB30AEB532E3054, + 0x8FD948E46DBC3128, + 0x58EBF2EF34C6FFEA, + 0xFE28ED61EE7C3C73, + 0x5D4A14D9E864B7E3, + 0x42105D14203E13E0, + 0x45EEE2B6A3AAABEA, + 0xDB6C4F15FACB4FD0, + 0xC742F442EF6ABBB5, + 0x654F3B1D41CD2105, + 0xD81E799E86854DC7, + 0xE44B476A3D816250, + 0xCF62A1F25B8D2646, + 0xFC8883A0C1C7B6A3, + 0x7F1524C369CB7492, + 0x47848A0B5692B285, + 0x095BBF00AD19489D, + 0x1462B17423820E00, + 0x58428D2A0C55F5EA, + 0x1DADF43E233F7061, + 0x3372F0928D937E41, + 0xD65FECF16C223BDB, + 0x7CDE3759CBEE7460, + 0x4085F2A7CE77326E, + 0xA607808419F8509E, + 0xE8EFD85561D99735, + 0xA969A7AAC50C06C2, +}; +#endif // _ROLLING_HASH2_TABLE_H_ diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c new file mode 100644 index 000000000..ee45c120d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c @@ -0,0 +1,314 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include +#include +#include "rolling_hashx.h" + +#ifndef FUT_run +# define FUT_run rolling_hash2_run +#endif +#ifndef FUT_init +# define FUT_init rolling_hash2_init +#endif +#ifndef FUT_reset +# define FUT_reset rolling_hash2_reset +#endif +#ifndef FUT_ref +# define FUT_ref rolling_hash2_ref +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define MAX_BUFFER_SIZE 128*1024*1024 +#define MAX_ROLLING_HASH_WIDTH 32 + +#ifndef RANDOMS +# define RANDOMS 200 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static +uint64_t rolling_hash2_ref(struct rh_state2 *state, unsigned char *p, int len, + uint64_t hash_init) +{ + int i; + uint64_t h = hash_init; + + for (i = 0; i < len; i++) { + h = (h << 1) | (h >> (64 - 1)); + h ^= state->table1[*p++]; + } + return h; +} + +int ones_in_mask(uint32_t in) +{ + int count; + + for (count = 0; in != 0; in &= (in - 1)) + count++; + + return count; +} + +/* + * Utility function to pick a random mask. Not uniform in number of bits. 
+ */ +uint32_t pick_rand_mask_in_range(int min_bits, int max_bits) +{ + uint32_t mask = 0; + int ones; + + do { + mask = rand(); +#if defined(_WIN32) || defined(_WIN64) + mask = (mask << 16) ^ rand(); +#endif + ones = ones_in_mask(mask); + } while (ones < min_bits || ones > max_bits); + + return mask; +} + +int main(void) +{ + uint8_t *buffer; + uint64_t hash; + uint32_t mask, trigger, offset = 0; + int i, w, r, ret, max, errors = 0; + uint32_t offset_fut; + struct rh_state2 state; + + printf(xstr(FUT_run) ": " xstr(MAX_BUFFER_SIZE)); + + buffer = malloc(MAX_BUFFER_SIZE); + if (buffer == NULL) { + printf("cannot allocate mem\n"); + return -1; + } + srand(TEST_SEED); + + // Test case 1, compare trigger case at boundary with reference hash + w = 32; + mask = 0xffff0; + trigger = 0x3df0; + trigger &= mask; + + for (i = 0; i < MAX_BUFFER_SIZE; i++) + buffer[i] = rand(); + + FUT_init(&state, w); + FUT_reset(&state, buffer); + + uint8_t *p = buffer; + int remain = MAX_BUFFER_SIZE; + ret = FINGERPRINT_RET_HIT; + + while ((ret == FINGERPRINT_RET_HIT) && (remain > 0)) { + ret = FUT_run(&state, p, remain, mask, trigger, &offset); + + if (offset > remain) { + printf(" error offset past remaining limit\n"); + errors++; + } + + if ((ret == FINGERPRINT_RET_HIT) && (&p[offset] > &buffer[w])) { + hash = FUT_ref(&state, &p[offset] - w, w, 0); + if ((hash & mask) != trigger) { + printf(" mismatch chunk from ref"); + printf(" hit: offset=%d %lx %lx\n", offset, state.hash, hash); + errors++; + } + } + p += offset; + remain -= offset; + putchar('.'); + } + + putchar('.'); // Finished test 1 + + // Test case 2, check if reference function hits same chunk boundary as test + + w = 32; + mask = 0xffff; + trigger = rand(); + trigger &= mask; + p = buffer; + + // Function under test + FUT_init(&state, w); + FUT_reset(&state, p); + ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut); + offset_fut += w; + + // Reference + for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) { + hash = FUT_ref(&state, p++, w, 0); + if ((hash & mask) == trigger) + break; + } + + if (offset != offset_fut) { + printf("\ncase 2, offset of chunk different from ref\n"); + printf(" case 2: stop fut at offset=%d\n", offset_fut); + printf(" case 2: stop ref at offset=%d\n", offset); + errors++; + return errors; + } + putchar('.'); // Finished test 2 + + // Do case 2 above with random args + + for (r = 0; r < RANDOMS; r++) { + w = rand() % MAX_ROLLING_HASH_WIDTH; + if (w < 3) + continue; + + mask = pick_rand_mask_in_range(4, 20); + trigger = rand() & mask; + p = buffer; + + // Function under test + FUT_init(&state, w); + FUT_reset(&state, p); + ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut); + offset_fut += w; + + // Reference + for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) { + hash = FUT_ref(&state, p++, w, 0); + if ((hash & mask) == trigger) + break; + } + + if (offset != offset_fut) { + printf("\nrand case 2 #%d: w=%d, mask=0x%x, trigger=0x%x\n", r, w, + mask, trigger); + printf(" offset of chunk different from ref\n"); + printf(" case 2r: stop fut at offset=%d\n", offset_fut); + printf(" case 2r: stop ref at offset=%d\n", offset); + errors++; + return errors; + } + putchar('.'); + } + + // Test case 3, check if max bound is same + + w = 32; + mask = 0xfffff; + trigger = rand(); + trigger &= mask; + putchar('|'); + + for (max = w + 1; max < 500; max++) { + p = buffer; + FUT_init(&state, w); + FUT_reset(&state, p); + + ret = FUT_run(&state, p + w, max - w, mask, 
trigger, &offset_fut); + offset_fut += w; + + int ret_ref = FINGERPRINT_RET_MAX; + for (p++, offset = w + 1; offset < max; offset++) { + hash = FUT_ref(&state, p++, w, 0); + if ((hash & mask) == trigger) { + ret_ref = FINGERPRINT_RET_HIT; + break; + } + } + + if (offset != offset_fut || ret != ret_ref) { + printf("\ncase 3 max=%d, offset of chunk different from ref\n", max); + printf(" case 3: stop fut at offset=%d\n", offset_fut); + printf(" case 3: stop ref at offset=%d\n", offset); + printf(" case 3: ret_fut=%d ret_ref=%d\n", ret, ret_ref); + errors++; + return errors; + } + putchar('.'); // Finished test 3 + } + + // Test case 4, check if max bound is same under random params + + for (r = 0; r < RANDOMS; r++) { + p = buffer; + mask = pick_rand_mask_in_range(24, 30); // Pick an unlikely mask + trigger = rand() & mask; + w = rand() % MAX_ROLLING_HASH_WIDTH; + max = rand() % 1024; + + if (w < 3 || max < 2 * MAX_ROLLING_HASH_WIDTH) + continue; + + FUT_init(&state, w); + FUT_reset(&state, p); + + ret = FUT_run(&state, p, max, mask, trigger, &offset_fut); + + if (offset_fut <= w) + continue; + + int ret_ref = FINGERPRINT_RET_MAX; + for (p++, offset = w + 1; offset < max; offset++) { + hash = FUT_ref(&state, p++, w, 0); + if ((hash & mask) == trigger) { + ret_ref = FINGERPRINT_RET_HIT; + break; + } + } + + if (offset != offset_fut || ret != ret_ref) { + printf("\ncase 4 rand case different from ref, max=%d w=%d\n", max, w); + printf(" case 4: stop fut at offset=%d\n", offset_fut); + printf(" case 4: stop ref at offset=%d\n", offset); + printf(" case 4: ret_fut=%d ret_ref=%d\n", ret, ret_ref); + errors++; + return errors; + } + putchar('.'); // Finished test 4 + + if (ret == FINGERPRINT_RET_HIT) { + p[-1] = rand(); // Keep hits from repeating + } + } + + if (errors > 0) + printf(" Fail: %d\n", errors); + else + printf(" Pass\n"); + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm new file mode 100644 index 000000000..99091faa4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm @@ -0,0 +1,204 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
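The scalar kernel that follows, rolling_hash2_run_until_00, is easier to read as C. The sketch below mirrors its loop structure; it assumes, as the two-table lookup suggests, that t2 holds the t1 entries pre-rotated by the window width so that the byte leaving the window cancels with a single XOR. The index convention on a hit is taken from the .ret_0/.ret_1 exits of the assembly.

#include <stdint.h>

static inline uint64_t rotl64(uint64_t v, int s)
{
    return (v << s) | (v >> (64 - s));
}

/* C sketch of the assembly loop: b1[i] is the byte entering the window,
 * b2[i] the byte that left it w positions earlier. */
uint64_t rh2_run_until_sketch(uint32_t *idx, int max_idx, const uint64_t *t1,
                              const uint64_t *t2, const uint8_t *b1,
                              const uint8_t *b2, uint64_t hash,
                              uint64_t mask, uint64_t trigger)
{
    uint32_t i = *idx;

    while ((int)i < max_idx) {
        hash = rotl64(hash, 1);           /* "ror hash, 0x3f" in the assembly */
        hash ^= t1[b1[i]] ^ t2[b2[i]];    /* fold in the new byte, drop the old one */
        if ((hash & mask) == trigger)
            break;                        /* i stays at the triggering byte (.ret_0/.ret_1) */
        i++;
    }
    *idx = i;
    return hash;
}

The assembly unrolls this by two bytes per iteration and handles the odd trailing byte separately, but the per-byte work is the same rotate, two table loads and two XORs.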
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; uint64_t rolling_hash2_run_until_00(uint32_t *idx, uint32_t buffer_length, uint64_t *t1, +;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask, +;;; uint64_t trigger) + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define arg6 r10 + %define arg7 r11 + %define arg8 r12 ; must be saved and loaded + %define tmp1 rbp ; must be saved and loaded + %define tmp2 rbx ; must be saved and loaded + %define tmp3 r13 ; must be saved and loaded + %define tmp4 r14 ; must be saved and loaded + %define tmp5 r15 ; must be saved and loaded + %define return rax + %define PS 8 + %define frame_size 6*8 + %define arg(x) [rsp + frame_size + PS + PS*x] + + %define func(x) x: + %macro FUNC_SAVE 0 + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov arg6, arg(0) + mov arg7, arg(1) + mov arg8, arg(2) + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved and loaded + %define arg5 r13 ; must be saved and loaded + %define arg6 r14 ; must be saved and loaded + %define arg7 r15 ; must be saved and loaded + %define arg8 rbx ; must be saved and loaded + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 rdi ; must be saved and loaded + %define tmp4 rsi ; must be saved and loaded + %define tmp5 rbp ; must be saved and loaded + %define return rax + %define PS 8 + %define frame_size 8*8 + %define arg(x) [rsp + frame_size + PS + PS*x] + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + push_reg r12 + push_reg r13 + push_reg r14 + push_reg r15 + push_reg rbx + push_reg rdi + push_reg rsi + push_reg rbp + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + mov arg6, arg(6) + mov arg7, arg(7) + mov arg8, arg(8) + %endmacro + + %macro FUNC_RESTORE 0 + pop rbp + pop rsi + pop rdi + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +%define idx arg0 +%define max arg1 +%define t1 arg2 +%define t2 arg3 +%define b1 arg4 +%define b2 arg5 +%define hash arg6 +%define mask arg7 +%define trigger arg8 + +%define pos rax +%define pos.w eax +%define x tmp2 +%define y tmp3 +%define z tmp4 +%define h tmp1 +%define a tmp5 + +default rel +[bits 64] +section .text + +align 16 +mk_global rolling_hash2_run_until_00, function +func(rolling_hash2_run_until_00) + endbranch + FUNC_SAVE + mov pos.w, dword [idx] + sub max, 2 + cmp pos, max + jg .less_than_2 + +.loop2: ror hash, 0x3f + movzx x, byte [b1 + pos] + movzx a, byte [b1 + pos + 1] + movzx y, byte [b2 + pos] + movzx h, byte [b2 + pos + 1] + mov z, [t1 + x * 8] + xor z, [t2 + y * 8] + xor hash, z + mov x, hash + and x, mask + cmp x, trigger + je .ret_0 + + ror hash, 0x3f + mov z, [t1 + a * 8] + xor z, [t2 + h 
* 8] + xor hash, z + mov y, hash + and y, mask + cmp y, trigger + je .ret_1 + + add pos, 2 + cmp pos, max + jle .loop2 + +.less_than_2: + add max, 1 + cmp pos, max + jg .ret_0 + ror hash, 0x3f + movzx x, byte [b1 + pos] + movzx y, byte [b2 + pos] + mov z, [t1 + x * 8] + xor z, [t2 + y * 8] + xor hash, z +.ret_1: add pos, 1 +.ret_0: mov dword [idx], pos.w + mov rax, hash + FUNC_RESTORE + ret + +endproc_frame + +section .data diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm new file mode 100644 index 000000000..3f4e8353b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm @@ -0,0 +1,203 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
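The _04 variant that follows differs from _00 mainly in two BMI2 instructions: rorx gives a non-destructive rotate, and pext compresses the masked hash bits so the trigger test becomes a straight compare against a trigger that was itself pext-compressed once in the prologue ("pext trigger, trigger, mask"). In intrinsic form the test is equivalent to the (hash & mask) == trigger check of the _00 kernel, as this small sketch shows; trigger_pext stands for _pext_u64(trigger, mask).

#include <stdint.h>
#include <immintrin.h>   /* _pext_u64 -- requires BMI2 */

/* Same truth value as (hash & mask) == trigger, since pext is injective
 * on the bits selected by mask. */
static inline int rh2_hit_bmi2(uint64_t hash, uint64_t mask, uint64_t trigger_pext)
{
    return _pext_u64(hash, mask) == trigger_pext;
}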
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; uint64_t rolling_hash2_run_until_04(uint32_t *idx, uint32_t max_idx, uint64_t *t1, +;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask, +;;; uint64_t trigger) + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define arg6 r10 + %define arg7 r11 + %define arg8 r12 ; must be saved and loaded + %define tmp1 rbp ; must be saved and loaded + %define tmp2 rbx ; must be saved and loaded + %define tmp3 r13 ; must be saved and loaded + %define tmp4 r14 ; must be saved and loaded + %define tmp5 r15 ; must be saved and loaded + %define return rax + %define PS 8 + %define frame_size 6*8 + %define arg(x) [rsp + frame_size + PS + PS*x] + + %define func(x) x: + %macro FUNC_SAVE 0 + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov arg6, arg(0) + mov arg7, arg(1) + mov arg8, arg(2) + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved and loaded + %define arg5 r13 ; must be saved and loaded + %define arg6 r14 ; must be saved and loaded + %define arg7 r15 ; must be saved and loaded + %define arg8 rbx ; must be saved and loaded + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 rdi ; must be saved and loaded + %define tmp4 rsi ; must be saved and loaded + %define tmp5 rbp ; must be saved and loaded + %define return rax + %define PS 8 + %define frame_size 8*8 + %define arg(x) [rsp + frame_size + PS + PS*x] + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + push_reg r12 + push_reg r13 + push_reg r14 + push_reg r15 + push_reg rbx + push_reg rdi + push_reg rsi + push_reg rbp + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + mov arg6, arg(6) + mov arg7, arg(7) + mov arg8, arg(8) + %endmacro + + %macro FUNC_RESTORE 0 + pop rbp + pop rsi + pop rdi + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +%define idx arg0 +%define max arg1 +%define t1 arg2 +%define t2 arg3 +%define b1 arg4 +%define b2 arg5 +%define hash arg6 +%define mask arg7 +%define trigger arg8 + +%define pos rax +%define pos.w eax +%define x tmp2 +%define y tmp3 +%define z tmp4 +%define h tmp1 +%define a tmp5 + +default rel +[bits 64] +section .text + +align 16 +mk_global rolling_hash2_run_until_04, function +func(rolling_hash2_run_until_04) + endbranch + FUNC_SAVE + mov pos.w, dword [idx] + pext trigger, trigger, mask + sub max, 2 + cmp pos, max + jg .less_than_2 + +.loop2: rorx hash, hash, 0x3f + movzx x, byte [b1 + pos] + movzx a, byte [b1 + pos + 1] + movzx y, byte [b2 + pos] + movzx h, byte [b2 + pos + 1] + mov z, [t1 + x * 8] + xor z, [t2 + y * 8] + xor hash, z + pext x, hash, mask + cmp x, trigger + je .ret_0 + + rorx hash, hash, 0x3f + mov z, [t1 + a * 8] + xor z, [t2 + h * 8] + xor hash, z + pext y, hash, mask + cmp y, trigger + je .ret_1 + + add pos, 2 + cmp pos, max + jle .loop2 + +.less_than_2: + add max, 1 + cmp pos, max + jg .ret_0 + rorx hash, hash, 0x3f + movzx x, byte [b1 + pos] + movzx y, byte [b2 + pos] + mov z, [t1 + x * 8] + xor z, [t2 + y * 8] + xor hash, z +.ret_1: add pos, 1 +.ret_0: mov dword [idx], pos.w + mov rax, hash + FUNC_RESTORE + ret + +endproc_frame + +section .data diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c 
b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c new file mode 100644 index 000000000..4197def0e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#ifdef _MSC_VER +# define inline __inline +#endif + +inline int floor_pow2(uint32_t in) +{ + uint32_t x = in; + + while (in) { + x = in; + in &= (in - 1); + } + return x; +} + +inline uint32_t rol(uint32_t x, int i) +{ + return x << i | x >> (8 * sizeof(x) - i); +} + +uint32_t rolling_hashx_mask_gen(long mean, int shift) +{ + if (mean <= 2) + mean = 2; + + return rol(floor_pow2(mean) - 1, shift); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver rolling_hashx_mask_gen_slver_00000260; +struct slver rolling_hashx_mask_gen_slver = { 0x0260, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am new file mode 100644 index 000000000..3f3c589ad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am @@ -0,0 +1,130 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_x86_64 += sha1_mb/sha1_ctx_sse.c \ + sha1_mb/sha1_ctx_avx.c \ + sha1_mb/sha1_ctx_avx2.c \ + sha1_mb/sha1_ctx_base.c + +lsrc_x86_64 += sha1_mb/sha1_mb_mgr_init_sse.c \ + sha1_mb/sha1_mb_mgr_init_avx2.c + +lsrc_x86_64 += sha1_mb/sha1_mb_mgr_submit_sse.asm \ + sha1_mb/sha1_mb_mgr_submit_avx.asm \ + sha1_mb/sha1_mb_mgr_submit_avx2.asm \ + sha1_mb/sha1_mb_mgr_flush_sse.asm \ + sha1_mb/sha1_mb_mgr_flush_avx.asm \ + sha1_mb/sha1_mb_mgr_flush_avx2.asm \ + sha1_mb/sha1_mb_x4_sse.asm \ + sha1_mb/sha1_mb_x4_avx.asm \ + sha1_mb/sha1_mb_x8_avx2.asm \ + sha1_mb/sha1_multibinary.asm + +lsrc_x86_64 += sha1_mb/sha1_ctx_avx512.c \ + sha1_mb/sha1_mb_mgr_init_avx512.c \ + sha1_mb/sha1_mb_mgr_submit_avx512.asm \ + sha1_mb/sha1_mb_mgr_flush_avx512.asm \ + sha1_mb/sha1_mb_x16_avx512.asm + +lsrc_x86_64 += sha1_mb/sha1_opt_x1.asm + +lsrc_x86_64 += sha1_mb/sha1_ni_x1.asm \ + sha1_mb/sha1_ni_x2.asm \ + sha1_mb/sha1_ctx_sse_ni.c \ + sha1_mb/sha1_ctx_avx512_ni.c \ + sha1_mb/sha1_mb_mgr_submit_sse_ni.asm \ + sha1_mb/sha1_mb_mgr_flush_sse_ni.asm \ + sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += sha1_mb/sha1_ctx_base.c \ + sha1_mb/sha1_ref.c \ + sha1_mb/aarch64/sha1_mb_multibinary.S \ + sha1_mb/aarch64/sha1_ctx_ce.c \ + sha1_mb/aarch64/sha1_mb_x1_ce.S \ + sha1_mb/aarch64/sha1_mb_x2_ce.S \ + sha1_mb/aarch64/sha1_mb_mgr_ce.c \ + sha1_mb/aarch64/sha1_ctx_asimd.c \ + sha1_mb/aarch64/sha1_aarch64_x1.S \ + sha1_mb/aarch64/sha1_mb_asimd_x4.S \ + sha1_mb/aarch64/sha1_mb_mgr_asimd.c \ + sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c + + + +lsrc_base_aliases += sha1_mb/sha1_ctx_base_aliases.c \ + sha1_mb/sha1_ctx_base.c \ + sha1_mb/sha1_ref.c + +src_include += -I $(srcdir)/sha1_mb + +extern_hdrs += include/sha1_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + include/multibinary.asm \ + sha1_mb/sha1_job.asm \ + sha1_mb/sha1_mb_mgr_datastruct.asm \ + include/reg_sizes.asm \ + sha1_mb/sha1_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha1_mb/sha1_mb_test \ + sha1_mb/sha1_mb_rand_test \ + sha1_mb/sha1_mb_rand_update_test \ + sha1_mb/sha1_mb_flush_test + +unit_tests += sha1_mb/sha1_mb_rand_ssl_test + +perf_tests += sha1_mb/sha1_mb_vs_ossl_perf \ + sha1_mb/sha1_mb_vs_ossl_shortage_perf + +examples += sha1_mb/sha1_multi_buffer_example + + +sha1_mb_rand_test: sha1_ref.o +sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + 
+sha1_mb_rand_update_test: sha1_ref.o +sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + +sha1_mb_flush_test: sha1_ref.o +sha1_mb_sha1_mb_flush_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + +sha1_mb_rand_ssl_test: LDLIBS += -lcrypto +sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sha1_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sha1_mb_sha1_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S new file mode 100644 index 000000000..55d6f932f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S @@ -0,0 +1,294 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + .arch armv8-a + + input_data .req x0 + num_blocks .req w1 + digest .req x2 + + // x2 is reused intentionally between digest/tmp + // due to running out of registers + TMP .req x2 + TMPW .req w2 + sha1key_adr .req x3 + WK .req w3 + WF .req w4 + WA .req w5 + WB .req w6 + WC .req w7 + WD .req w8 + WE .req w9 + WORD0 .req w10 + WORD1 .req w11 + WORD2 .req w12 + WORD3 .req w13 + WORD4 .req w14 + WORD5 .req w15 + WORD6 .req w16 + WORD7 .req w17 + WORD8 .req w18 + WORD9 .req w19 + WORD10 .req w20 + WORD11 .req w21 + WORD12 .req w22 + WORD13 .req w23 + WORD14 .req w24 + WORD15 .req w25 + AA .req w26 + BB .req w27 + CC .req w28 + DD .req w29 + EE .req w30 + + TT .req w0 + +.macro save_stack + stp x16,x17,[sp, -128]! 
+ stp x18,x19,[sp, 16] + stp x20,x21,[sp, 32] + stp x22,x23,[sp, 48] + stp x24,x25,[sp, 64] + stp x26,x27,[sp, 80] + stp x28,x29,[sp, 96] + str x30,[sp, 112] + // have to reuse x2, which is digest address + str x2,[sp, 120] +.endm + +.macro restore_stack + ldp x18,x19,[sp, 16] + ldp x20,x21,[sp, 32] + ldp x22,x23,[sp, 48] + ldp x24,x25,[sp, 64] + ldp x26,x27,[sp, 80] + ldp x28,x29,[sp, 96] + ldr x30,[sp, 112] + ldr x2,[sp, 120] + ldp x16,x17,[sp],128 +.endm +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor WF, WC, WD + and WF, WB, WF + eor WF, WD, WF +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor WF, WB, WC + eor WF, WF, WD +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and TMPW, WB, WC + and WF, WB, WD + orr WF, WF, TMPW + and TMPW, WC, WD + orr WF, WF, TMPW +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_word_at \windex + .endif +.endm + +.macro SHA1_STEP_00_15 windex:req + rev WORD\windex\(),WORD\windex\() + next_word=\windex+1 + load_next_word %next_word + + ror TMPW,WA,#32-5 + add WE,WE,TMPW + add WE,WE,WK + FUNC_F0 + ror WB,WB,#32-30 + add WE,WE,WORD\windex\() + add WE,WE,WF +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor TMPW,\reg_14,\reg_8 + eor \reg_16,\reg_16,\reg_3 + eor \reg_16,\reg_16,TMPW + + ror TMPW,WA,#32-5 + ror \reg_16,\reg_16, #32 - 1 + + add WE,WE,TMPW + add WE,WE,WK + \func_f + ror WB,WB,#32-30 + add WE,WE,\reg_16 + add WE,WE,WF +.endm + +.macro SWAP_STATES + .unreq TT + TT .req WE + .unreq WE + WE .req WD + .unreq WD + WD .req WC + .unreq WC + WC .req WB + .unreq WB + WB .req WA + .unreq WA + WA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15 windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.altmacro + +.macro load_two_words_at idx0:req,idx1:req + ldp WORD\idx0\(),WORD\idx1\(),[input_data],8 +.endm + +.macro load_word_at idx:req + .if \idx % 2 == 0 + idx1=\idx+1 + load_two_words_at \idx,%idx1 + .endif +.endm + +/* + * void sha1_aarch64_x1(uint32_t *input_data, int num_blocks, uint32_t digest[5]) + */ + .global sha1_aarch64_x1 + .type sha1_aarch64_x1, %function +sha1_aarch64_x1: + cmp num_blocks, #0 + beq .return + + ldp WA,WB,[digest] + ldp WC,WD,[digest,8] + ldr WE,[digest,16] + save_stack + +.block_loop: + mov AA, WA + mov BB, WB + mov CC, WC + mov DD, WD + mov EE, WE + + load_word_at 0 + + adr sha1key_adr, KEY_0 + ldr WK, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ldr WK, [sha1key_adr] + exec_steps 
20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ldr WK, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ldr WK, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add WA, AA, WA + add WB, BB, WB + add WC, CC, WC + add WD, DD, WD + add WE, EE, WE + + subs num_blocks, num_blocks, 1 + bne .block_loop + + restore_stack + stp WA,WB,[digest] + stp WC,WD,[digest,8] + str WE,[digest,16] + +.return: + ret + + .size sha1_aarch64_x1, .-sha1_aarch64_x1 + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +KEY_0: + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S new file mode 100644 index 000000000..c8b8dd982 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
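Both the plain aarch64 file above and the ASIMD macros below encode the same textbook SHA-1 round: e += rotl(a,5) + F(b,c,d) + K + W[t], then b = rotl(b,30), after which the five working variables rotate roles, something the SWAP_STATES macros implement by renaming registers rather than moving data. For t >= 16 the schedule word is rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), kept in a 16-entry ring (the idx3/idx8/idx14/idx16 arithmetic). A plain C restatement of one round, for reference only; the ASIMD version runs the same update on four independent 32-bit lanes at once.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int s)
{
    return (v << s) | (v >> (32 - s));
}

/* One SHA-1 round over the state s = {a,b,c,d,e}; w is the schedule word W[t]. */
static void sha1_round_sketch(uint32_t s[5], uint32_t w, int t)
{
    uint32_t f, k;

    if (t < 20) {        /* FUNC_F0 / KEY_0 */
        f = s[3] ^ (s[1] & (s[2] ^ s[3]));
        k = 0x5a827999;
    } else if (t < 40) { /* FUNC_F1 / KEY_1 */
        f = s[1] ^ s[2] ^ s[3];
        k = 0x6ed9eba1;
    } else if (t < 60) { /* FUNC_F2 / KEY_2 */
        f = (s[1] & s[2]) | (s[1] & s[3]) | (s[2] & s[3]);
        k = 0x8f1bbcdc;
    } else {             /* FUNC_F3 / KEY_3 */
        f = s[1] ^ s[2] ^ s[3];
        k = 0xca62c1d6;
    }

    uint32_t e = s[4] + rotl32(s[0], 5) + f + k + w;

    /* Rotate the roles of a..e; the assembly renames registers instead. */
    s[4] = s[3];
    s[3] = s[2];
    s[2] = rotl32(s[1], 30);
    s[1] = s[0];
    s[0] = e;
}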
+**********************************************************************/ + + .arch armv8-a + +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor VF.16b, VC.16b, VD.16b + and VF.16b, VB.16b, VF.16b + eor VF.16b, VD.16b, VF.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor VF.16b, VB.16b, VC.16b + eor VF.16b, VF.16b, VD.16b +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and vT0.16b, VB.16b, VC.16b + and vT1.16b, VB.16b, VD.16b + and vT2.16b, VC.16b, VD.16b + orr VF.16b, vT0.16b, vT1.16b + orr VF.16b, VF.16b, vT2.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_x4_word \windex + .endif +.endm + +// FUNC_F0 is merged into STEP_00_15 for efficiency +.macro SHA1_STEP_00_15_F0 windex:req + rev32 WORD\windex\().16b,WORD\windex\().16b + next_word=\windex+1 + load_next_word %next_word + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, VA.4s, 32 - 5 + add VE.4s, VE.4s, VK.4s + sli VT.4s, VA.4s, 5 + eor VF.16b, VC.16b, VD.16b + add VE.4s, VE.4s, WORD\windex\().4s + and VF.16b, VB.16b, VF.16b + add VE.4s, VE.4s, VT.4s + eor VF.16b, VD.16b, VF.16b + ushr VT.4s, VB.4s, 32 - 30 + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor vT0.16b,\reg_3\().16b,\reg_8\().16b + eor VT.16b,\reg_14\().16b,\reg_16\().16b + eor vT0.16b,vT0.16b,VT.16b + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, vT0.4s, 32 - 1 + add VE.4s, VE.4s, VK.4s + ushr vT1.4s, VA.4s, 32 - 5 + sli VT.4s, vT0.4s, 1 + add VE.4s, VE.4s, VT.4s + sli vT1.4s, VA.4s, 5 + mov \reg_16\().16b,VT.16b + add VE.4s, VE.4s, vT1.4s + ushr VT.4s, VB.4s, 32 - 30 + \func_f + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + + VA .req v0 + VB .req v1 + VC .req v2 + VD .req v3 + VE .req v4 + VT .req v5 + VF .req v6 + VK .req v7 + WORD0 .req v8 + WORD1 .req v9 + WORD2 .req v10 + WORD3 .req v11 + WORD4 .req v12 + WORD5 .req v13 + WORD6 .req v14 + WORD7 .req v15 + WORD8 .req v16 + WORD9 .req v17 + WORD10 .req v18 + WORD11 .req v19 + WORD12 .req v20 + WORD13 .req v21 + WORD14 .req v22 + WORD15 .req v23 + vT0 .req v24 + vT1 .req v25 + vT2 .req v26 + vAA .req v27 + vBB .req v28 + vCC .req v29 + vDD .req v30 + vEE .req v31 + TT .req v0 + sha1key_adr .req x15 + +.macro SWAP_STATES + // shifted VB is held in VT after each step + .unreq TT + TT .req VE + .unreq VE + VE .req VD + .unreq VD + VD .req VC + .unreq VC + VC .req VT + .unreq VT + VT .req VB + .unreq VB + VB .req VA + .unreq VA + VA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15_F0 windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES + + .if \windex == 79 + // after 80 steps, the registers ABCDET has shifted from + // its 
orignal order of 012345 to 341520 + // have to swap back for both compile- and run-time correctness + mov v0.16b,v3.16b + .unreq VA + VA .req v0 + + mov vT0.16b,v2.16b + mov v2.16b,v1.16b + mov v1.16b,v4.16b + .unreq VB + VB .req v1 + .unreq VC + VC .req v2 + + mov v3.16b,v5.16b + .unreq VD + VD .req v3 + + mov v4.16b,vT0.16b + .unreq VE + VE .req v4 + + .unreq VT + VT .req v5 + .endif +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.macro sha1_single + load_x4_word 0 + + mov vAA.16B, VA.16B + mov vBB.16B, VB.16B + mov vCC.16B, VC.16B + mov vDD.16B, VD.16B + mov vEE.16B, VE.16B + + adr sha1key_adr, KEY_0 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add VA.4s, vAA.4s, VA.4s + add VB.4s, vBB.4s, VB.4s + add VC.4s, vCC.4s, VC.4s + add VD.4s, vDD.4s, VD.4s + add VE.4s, vEE.4s, VE.4s +.endm + +.macro sha1_asimd_save_stack + stp d8,d9,[sp, -64]! + stp d10,d11,[sp, 16] + stp d12,d13,[sp, 32] + stp d14,d15,[sp, 48] +.endm + +.macro sha1_asimd_restore_stack + ldp d10,d11,[sp, 16] + ldp d12,d13,[sp, 32] + ldp d14,d15,[sp, 48] + ldp d8,d9,[sp],64 +.endm diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c new file mode 100644 index 000000000..9a9952ff6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
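The context-manager file that follows implements the submit/flush model used by all isa-l multi-buffer hashes: callers submit independent jobs, the manager batches them across SIMD lanes, and completed jobs come back either from submit or from flush. A hedged usage sketch is given below; sha1_ctx_mgr_*_asimd, HASH_ENTIRE, HASH_CTX_STS_COMPLETE and job.result_digest are taken from the sources in this patch, while hash_ctx_init() and the user_data field are assumed to come from the multi_buffer framework headers.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sha1_mb.h"

static void report(SHA1_HASH_CTX *ctx)
{
    printf("job %ld done, digest[0] = %08x\n",
           (long)ctx->user_data, ctx->job.result_digest[0]);
}

int main(void)
{
    SHA1_HASH_CTX_MGR *mgr = NULL;
    SHA1_HASH_CTX ctxpool[4], *ctx;
    static unsigned char msgs[4][1024];

    if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
        return 1;
    sha1_ctx_mgr_init_asimd(mgr);

    for (int i = 0; i < 4; i++) {
        memset(msgs[i], 'a' + i, sizeof(msgs[i]));
        hash_ctx_init(&ctxpool[i]);                /* assumed helper from multi_buffer.h */
        ctxpool[i].user_data = (void *)(long)i;    /* assumed field, used as a job tag   */

        ctx = sha1_ctx_mgr_submit_asimd(mgr, &ctxpool[i],
                                        msgs[i], sizeof(msgs[i]), HASH_ENTIRE);
        if (ctx != NULL && (ctx->status & HASH_CTX_STS_COMPLETE))
            report(ctx);                           /* a queued job may finish on submit  */
    }

    /* Drain jobs still parked in partially filled SIMD lanes. */
    while ((ctx = sha1_ctx_mgr_flush_asimd(mgr)) != NULL)
        report(ctx);

    free(mgr);
    return 0;
}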
+**********************************************************************/ + +#include +#include +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state); +SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job); +SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_asimd(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_asimd(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_asimd(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_asimd(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_asimd(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_asimd_slver_02020142; +struct slver sha1_ctx_mgr_init_asimd_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_asimd_slver_02020143; +struct slver sha1_ctx_mgr_submit_asimd_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_asimd_slver_02020144; +struct slver sha1_ctx_mgr_flush_asimd_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c new file mode 100644 index 000000000..e40a344ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state); +SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job); +SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_ce(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_ce(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_ce(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_ce(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_ce(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_ce_slver_02020142; +struct slver sha1_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_ce_slver_02020143; +struct slver sha1_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_ce_slver_02020144; +struct slver sha1_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..0942c1a95 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c @@ -0,0 +1,93 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_submit) +{ + + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_submit_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_submit_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_init_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_init_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_flush_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_flush_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S new file mode 100644 index 000000000..012b15c14 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S @@ -0,0 +1,192 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + .arch armv8-a + +#include "sha1_asimd_common.S" + +.macro internal_load windex + // load 64-bytes from each address to maximize usage of cache line + .if \windex == 0 + mov tmp,dataptr + ld1 {WORD0.4s},[data0],16 + ld1 {WORD4.4s},[data0],16 + ld1 {WORD8.4s},[data0],16 + ld1 {WORD12.4s},[data0],16 + + ld1 {WORD1.4s},[data1],16 + ld1 {WORD5.4s},[data1],16 + ld1 {WORD9.4s},[data1],16 + ld1 {WORD13.4s},[data1],16 + + ld1 {WORD2.4s},[data2],16 + ld1 {WORD6.4s},[data2],16 + ld1 {WORD10.4s},[data2],16 + ld1 {WORD14.4s},[data2],16 + + ld1 {WORD3.4s},[data3],16 + ld1 {WORD7.4s},[data3],16 + ld1 {WORD11.4s},[data3],16 + ld1 {WORD15.4s},[data3],16 + + st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[0],[tmp],16 + st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[1],[tmp],16 + st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[2],[tmp],16 + st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[3],[tmp],16 + .endif + + .if \windex == 4 + mov tmp,dataptr + st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[0],[tmp],16 + st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[1],[tmp],16 + st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[2],[tmp],16 + st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[3],[tmp],16 + .endif + + .if \windex == 8 + mov tmp,dataptr + st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[0],[tmp],16 + st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[1],[tmp],16 + st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[2],[tmp],16 + st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[3],[tmp],16 + .endif + + .if \windex == 12 + mov tmp,dataptr + st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[0],[tmp],16 + st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[1],[tmp],16 + st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[2],[tmp],16 + st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[3],[tmp],16 + .endif +.endm + +.macro load_x4_word idx:req + internal_load \idx + ld1 {WORD\idx\().16b},[dataptr],16 +.endm + +/* + * void sha1_mb_asimd_x4(SHA1_JOB *j0, SHA1_JOB*j1, SHA1_JOB*j2, SHA1_JOB *j3, int blocks) + */ + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + num_blocks .req w4 + tmp .req x5 + data0 .req x6 + data1 .req x7 + data2 .req x8 + data3 .req x9 + databuf .req x10 + dataptr .req x11 + savedsp .req x12 + + .global sha1_mb_asimd_x4 + .type sha1_mb_asimd_x4, %function +sha1_mb_asimd_x4: + cmp num_blocks, #0 + beq .return + sha1_asimd_save_stack + mov savedsp,sp + sub databuf,sp,256 + mov tmp,63 + bic databuf,databuf,tmp + mov sp,databuf + + add tmp,job0,64 + ld4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16 + ld1 {VE.s}[0],[tmp] + ldr data0,[job0] + + add tmp,job1,64 + ld4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16 + ld1 {VE.s}[1],[tmp] + ldr data1,[job1] + + add tmp,job2,64 + ld4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16 + ld1 {VE.s}[2],[tmp] + ldr data2,[job2] + + add tmp,job3,64 + ld4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16 + ld1 
{VE.s}[3],[tmp] + ldr data3,[job3] + +.block_loop: + mov dataptr,databuf + sha1_single + subs num_blocks, num_blocks, 1 + bne .block_loop + + add tmp,job0,64 + st4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16 + st1 {VE.s}[0],[tmp] + + add tmp,job1,64 + st4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16 + st1 {VE.s}[1],[tmp] + + add tmp,job2,64 + st4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16 + st1 {VE.s}[2],[tmp] + + add tmp,job3,64 + st4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16 + st1 {VE.s}[3],[tmp] + + mov sp,savedsp + sha1_asimd_restore_stack +.return: + ret + + .size sha1_mb_asimd_x4, .-sha1_mb_asimd_x4 + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +KEY_0: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 +KEY_1: + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 +KEY_2: + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc +KEY_3: + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c new file mode 100644 index 000000000..4b34e7b53 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c @@ -0,0 +1,217 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include +#include "endian_helper.h" + +extern void sha1_aarch64_x1(const uint8_t * data, int num_blocks, uint32_t digest[]); +static inline void sha1_job_x1(SHA1_JOB * job, int blocks) +{ + sha1_aarch64_x1(job->buffer, blocks, job->result_digest); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SHA1_MB_ASIMD_MAX_LANES 4 +void sha1_mb_asimd_x4(SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) + +void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= SHA1_MB_ASIMD_MAX_LANES - 1 - i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + // lanes > SHA1_MB_ASIMD_MAX_LANES is invalid lane + for (; i < SHA1_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes, blocks; + int lane_idx_array[SHA1_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } + lanes = 0, len = 0; + for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); + blocks = len >> 4; + + /* for less-than-3-lane job, ASIMD really does not have much advantage + * compared to scalar due to wasted >= 50% capacity + * therefore we only run ASIMD for 3/4 lanes of data + */ + if (lanes == SHA1_MB_ASIMD_MAX_LANES) { + sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, + state->ldata[lane_idx_array[3]].job_in_lane, blocks); + } else if (lanes == 3) { + /* in case of 3 lanes, apparently ASIMD will still operate as if + * there were four lanes of data in processing (waste 25% capacity) + * theoretically we can let ASIMD implementation know the number of lanes + * so that it could "at least" save some memory loading time + * but in practice, we can just pass lane 0 as dummy for similar + * cache performance + */ + SHA1_JOB dummy; + dummy.buffer = state->ldata[lane_idx_array[0]].job_in_lane->buffer; + dummy.len = state->ldata[lane_idx_array[0]].job_in_lane->len; + sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane, + &dummy, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, blocks); + } else { + sha1_job_x1(state->ldata[lane_idx_array[0]].job_in_lane, blocks); + if (lanes >= 2) { + sha1_job_x1(state->ldata[lane_idx_array[1]].job_in_lane, blocks); + } + } + + // only return the min length job + for (i = 0; i < SHA1_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + return lane_idx; + +} + +static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state) +{ + int i; + SHA1_JOB *ret = NULL; + + for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = 
STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ + int lane_idx; + // add job into lanes + lane_idx = state->unused_lanes & 0xf; + // fatal error + assert(lane_idx < SHA1_MB_ASIMD_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA1_JOB *ret; + + // add job into lanes + sha1_mb_mgr_insert_job(state, job); + + ret = sha1_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + // submit will wait all lane has data + if (state->num_lanes_inuse < SHA1_MB_ASIMD_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha1_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha1_mb_mgr_do_jobs(state); +#endif + + // ~ i = lane_idx; + ret = sha1_mb_mgr_free_lane(state); + return ret; +} + +SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state) +{ + SHA1_JOB *ret; + ret = sha1_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha1_mb_mgr_do_jobs(state); + return sha1_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c new file mode 100644 index 000000000..1dfd67d0c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c @@ -0,0 +1,208 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SHA1_MB_CE_MAX_LANES 2 +#if SHA1_MB_CE_MAX_LANES >=2 +void sha1_mb_ce_x2(SHA1_JOB *, SHA1_JOB *, int); +#endif +void sha1_mb_ce_x1(SHA1_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SHA1_MB_CE_MAX_LANES is invalid lane + for (; i < SHA1_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SHA1_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SHA1_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha1_mb_ce_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); + +#if SHA1_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sha1_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sha1_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < SHA1_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + +} + +static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state) +{ + int i; + SHA1_JOB *ret = NULL; + + for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SHA1_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA1_JOB *ret; + + //add job into lanes + sha1_mb_mgr_insert_job(state, job); + + ret = sha1_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < 
SHA1_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha1_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha1_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sha1_mb_mgr_free_lane(state); + return ret; +} + +SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state) +{ + SHA1_JOB *ret; + ret = sha1_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha1_mb_mgr_do_jobs(state); + return sha1_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S new file mode 100644 index 000000000..bb1929d76 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface sha1_ctx_mgr_submit +mbin_interface sha1_ctx_mgr_init +mbin_interface sha1_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S new file mode 100644 index 000000000..22f736793 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S @@ -0,0 +1,194 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + +/** +maros for round 4-67 +*/ +.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req + sha1h \e0\()_s, \abcd\()_s + \inst \abcd\()_q,\e1\()_s,\tmp1\()_v.4s + add \tmp1\()_v.4s,\msg3\()_v.4s,\k\()_v.4s + sha1su1 \msg0\()_v.4s,\msg3\()_v.4s + sha1su0 \msg1\()_v.4s,\msg2\()_v.4s,\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key_0,28 + declare_var_vector_reg key_1,29 + declare_var_vector_reg key_2,30 + declare_var_vector_reg key_3,31 + + +/* +digest variables +*/ + declare_var_vector_reg abcd,0 + declare_var_vector_reg e0,1 + declare_var_vector_reg e1,2 + declare_var_vector_reg abcd_saved,3 + declare_var_vector_reg e0_saved,4 +/* +Message variables +*/ + declare_var_vector_reg msg_0,16 + declare_var_vector_reg msg_1,17 + declare_var_vector_reg msg_2,18 + declare_var_vector_reg msg_3,19 +/* +Temporay variables +*/ + declare_var_vector_reg tmp_0,5 + declare_var_vector_reg tmp_1,6 + +/* + void sha1_mb_ce_x1(SHA1_JOB * job, int len); +*/ +/* +Arguements list +*/ + job .req x0 + len .req w1 + data .req x2 + tmp .req x3 + .global sha1_mb_ce_x1 + .type sha1_mb_ce_x1, %function +sha1_mb_ce_x1: + ldr data, [job] + ldr abcd_q, [job, 64] + ldr e0_s, [job, 80] + adr tmp, KEY + ld1 {key_0_v.4s-key_3_v.4s},[tmp] + +start_loop: + + //load msgs + ld1 {msg_0_v.4s-msg_3_v.4s},[data] + + //adjust loop parameter + add data,data,64 + sub len, len, #1 + cmp len, 0 + //backup digest + mov abcd_saved_v.16b,abcd_v.16b + mov e0_saved_v.16b,e0_v.16b + + rev32 msg_0_v.16b,msg_0_v.16b + rev32 msg_1_v.16b,msg_1_v.16b + add tmp_0_v.4s,msg_0_v.4s,key_0_v.4s + rev32 msg_2_v.16b,msg_2_v.16b + add tmp_1_v.4s,msg_1_v.4s,key_0_v.4s + rev32 msg_3_v.16b,msg_3_v.16b + + /* rounds 0-3 */ + sha1h e1_s,abcd_s + sha1c abcd_q,e0_s,tmp_0_v.4s + add tmp_0_v.4s,msg_2_v.4s,key_0_v.4s + sha1su0 msg_0_v.4s,msg_1_v.4s,msg_2_v.4s + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0 + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds 
sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3 + + /* rounds 68-71 */ + sha1h e0_s,abcd_s + sha1p abcd_q,e1_s,tmp_1_v.4s + add tmp_1_v.4s,msg_3_v.4s,key_3_v.4s + sha1su1 msg_0_v.4s,msg_3_v.4s + + /* rounds 72-75 */ + sha1h e1_s,abcd_s + sha1p abcd_q,e0_s,tmp_0_v.4s + + /* rounds 76-79 */ + sha1h e0_s,abcd_s + sha1p abcd_q,e1_s,tmp_1_v.4s + + + + add abcd_v.4s,abcd_v.4s,abcd_saved_v.4s + add e0_v.2s,e0_v.2s,e0_saved_v.2s + + + bgt start_loop + str abcd_q, [job, 64] + str e0_s, [job, 80] + + ret + + .size sha1_mb_ce_x1, .-sha1_mb_ce_x1 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S new file mode 100644 index 000000000..93f653ad2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S @@ -0,0 +1,253 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + +/** +maros for round 4-67 +*/ +.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req + sha1h l0_\e0\()_s, l0_\abcd\()_s + sha1h l1_\e0\()_s, l1_\abcd\()_s + + \inst l0_\abcd\()_q,l0_\e1\()_s,l0_\tmp1\()_v.4s + \inst l1_\abcd\()_q,l1_\e1\()_s,l1_\tmp1\()_v.4s + + add l0_\tmp1\()_v.4s,l0_\msg3\()_v.4s,\k\()_v.4s + add l1_\tmp1\()_v.4s,l1_\msg3\()_v.4s,\k\()_v.4s + + sha1su1 l0_\msg0\()_v.4s,l0_\msg3\()_v.4s + sha1su1 l1_\msg0\()_v.4s,l1_\msg3\()_v.4s + + sha1su0 l0_\msg1\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s + sha1su0 l1_\msg1\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key_0,28 + declare_var_vector_reg key_1,29 + declare_var_vector_reg key_2,30 + declare_var_vector_reg key_3,31 + + +/* +lane variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_e0,1 + declare_var_vector_reg l0_e1,2 + declare_var_vector_reg l0_abcd_saved,3 + declare_var_vector_reg l0_e0_saved,4 + declare_var_vector_reg l0_tmp_0,5 + declare_var_vector_reg l0_tmp_1,6 + declare_var_vector_reg l0_msg_0,16 + declare_var_vector_reg l0_msg_1,17 + declare_var_vector_reg l0_msg_2,18 + declare_var_vector_reg l0_msg_3,19 + + declare_var_vector_reg l1_abcd,7 + declare_var_vector_reg l1_e0,8 + declare_var_vector_reg l1_e1,9 + declare_var_vector_reg l1_abcd_saved,24 + declare_var_vector_reg l1_e0_saved,25 + declare_var_vector_reg l1_tmp_0,26 + declare_var_vector_reg l1_tmp_1,27 + declare_var_vector_reg l1_msg_0,20 + declare_var_vector_reg l1_msg_1,21 + declare_var_vector_reg l1_msg_2,22 + declare_var_vector_reg l1_msg_3,23 + +/* + void sha1_mb_ce_x2(SHA1_JOB * job_0, SHA1_JOB * job_1,int len); +*/ + l0_job .req x0 + l1_job .req x1 + len .req w2 + + l0_data .req x3 + l1_data .req x4 + tmp .req x5 + .global sha1_mb_ce_x2 + .type sha1_mb_ce_x2, %function +sha1_mb_ce_x2: + //push d8,d9 to stack + stp d8, d9, [sp, -256]! 
+ + adr tmp, KEY + ld1 {key_0_v.4s-key_3_v.4s},[tmp] + ldr l0_data, [l0_job] + ldr l1_data, [l1_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_e0_s, [l0_job, 80] + ldr l1_abcd_q, [l1_job, 64] + ldr l1_e0_s, [l1_job, 80] + +start_loop: + + //load msgs + ld1 {l0_msg_0_v.4s-l0_msg_3_v.4s},[l0_data] + ld1 {l1_msg_0_v.4s-l1_msg_3_v.4s},[l1_data] + + //adjust loop parameter + add l0_data,l0_data,64 + add l1_data,l1_data,64 + sub len, len, #1 + cmp len, 0 + //backup digest + mov l0_abcd_saved_v.16b, l0_abcd_v.16b + mov l0_e0_saved_v.16b, l0_e0_v.16b + mov l1_abcd_saved_v.16b, l1_abcd_v.16b + mov l1_e0_saved_v.16b, l1_e0_v.16b + + rev32 l0_msg_0_v.16b, l0_msg_0_v.16b + rev32 l0_msg_1_v.16b, l0_msg_1_v.16b + add l0_tmp_0_v.4s, l0_msg_0_v.4s, key_0_v.4s + rev32 l0_msg_2_v.16b, l0_msg_2_v.16b + add l0_tmp_1_v.4s, l0_msg_1_v.4s, key_0_v.4s + rev32 l0_msg_3_v.16b, l0_msg_3_v.16b + + rev32 l1_msg_0_v.16b, l1_msg_0_v.16b + rev32 l1_msg_1_v.16b, l1_msg_1_v.16b + add l1_tmp_0_v.4s, l1_msg_0_v.4s, key_0_v.4s + rev32 l1_msg_2_v.16b, l1_msg_2_v.16b + add l1_tmp_1_v.4s, l1_msg_1_v.4s, key_0_v.4s + rev32 l1_msg_3_v.16b, l1_msg_3_v.16b + + /* rounds 0-3 */ + sha1h l0_e1_s, l0_abcd_s + sha1c l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s + add l0_tmp_0_v.4s, l0_msg_2_v.4s, key_0_v.4s + sha1su0 l0_msg_0_v.4s, l0_msg_1_v.4s, l0_msg_2_v.4s + + sha1h l1_e1_s, l1_abcd_s + sha1c l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s + add l1_tmp_0_v.4s, l1_msg_2_v.4s, key_0_v.4s + sha1su0 l1_msg_0_v.4s, l1_msg_1_v.4s, l1_msg_2_v.4s + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0 + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3 + + /* rounds 68-71 */ + sha1h l0_e0_s, l0_abcd_s + sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s + add l0_tmp_1_v.4s, l0_msg_3_v.4s, key_3_v.4s + sha1su1 l0_msg_0_v.4s, l0_msg_3_v.4s + + sha1h l1_e0_s, l1_abcd_s + sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s + add l1_tmp_1_v.4s, l1_msg_3_v.4s, key_3_v.4s + sha1su1 l1_msg_0_v.4s, l1_msg_3_v.4s + + /* rounds 72-75 */ + sha1h l0_e1_s, l0_abcd_s + sha1p l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s + + sha1h l1_e1_s, l1_abcd_s + sha1p l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s + + /* rounds 76-79 */ + sha1h l0_e0_s, l0_abcd_s + sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s + + sha1h l1_e0_s, l1_abcd_s + sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s + + + + add l0_abcd_v.4s, l0_abcd_v.4s, l0_abcd_saved_v.4s + add l0_e0_v.2s, l0_e0_v.2s, l0_e0_saved_v.2s + 
add l1_abcd_v.4s, l1_abcd_v.4s, l1_abcd_saved_v.4s + add l1_e0_v.2s, l1_e0_v.2s, l1_e0_saved_v.2s + + + + + bgt start_loop + + str l0_abcd_q, [l0_job, 64] + str l0_e0_s, [l0_job, 80] + + + str l1_abcd_q, [l1_job, 64] + str l1_e0_s, [l1_job, 80] + + //pop d8,d9 from stack + ldp d8, d9, [sp], 256 + ret + + .size sha1_mb_ce_x2, .-sha1_mb_ce_x2 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c new file mode 100644 index 000000000..ad91d64ac --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c @@ -0,0 +1,265 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
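+		// On the final submit, hash_pad() below writes the 0x80 byte and the
+		// 64-bit big-endian bit-length after whatever is left in the partial
+		// block buffer: one extra block when the leftover fits (0-55 bytes),
+		// two when the padding spills into a second block.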
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx_slver_02020142; +struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_avx_slver_02020143; +struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_avx_slver_02020144; +struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c new file mode 100644 index 000000000..85977d4c2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c @@ -0,0 +1,264 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
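+	// For example, with 40 bytes already buffered and a 100-byte submit, 24
+	// bytes are copied to complete a 64-byte block, that block is submitted
+	// here, and the remaining 76 bytes are picked up by
+	// sha1_ctx_mgr_resubmit() on return.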
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx2_slver_04020145; +struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 }; + +struct slver sha1_ctx_mgr_submit_avx2_slver_04020146; +struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 }; + +struct slver sha1_ctx_mgr_flush_avx2_slver_04020147; +struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c new file mode 100644 index 000000000..90e087163 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c @@ -0,0 +1,271 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
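+		// Note: the padded block below is submitted with the COMPLETE bit
+		// already set, so when that job comes back out of the manager the
+		// check at the top of this loop strips the PROCESSING bit and
+		// returns the finished context to the caller.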
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx512_slver_0600014a; +struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 }; + +struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b; +struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 }; + +struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c; +struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c new file mode 100644 index 000000000..2013f829a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c @@ -0,0 +1,281 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +/** + * sha1_ctx_avx512_ni related functions are aiming to utilize Canon Lake. + * Since SHANI is still slower than multibuffer for full lanes, + * sha1_ctx_mgr_init_avx512_ni and sha1_ctx_mgr_submit_avx512_ni are + * similar with their avx512 versions. + * sha1_ctx_mgr_flush_avx512_ni is different. It will call + * sha1_mb_mgr_flush_avx512_ni which would use shani when lanes are less + * than a threshold. + * + */ +#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI) + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx512_ni(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512_ni(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512_ni(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx512_ni_slver_080002c4; +struct slver sha1_ctx_mgr_init_avx512_ni_slver = { 0x02c4, 0x00, 0x08 }; + +struct slver sha1_ctx_mgr_submit_avx512_ni_slver_080002c5; +struct slver sha1_ctx_mgr_submit_avx512_ni_slver = { 0x02c5, 0x00, 0x08 }; + +struct slver sha1_ctx_mgr_flush_avx512_ni_slver_080002c6; +struct slver sha1_ctx_mgr_flush_avx512_ni_slver = { 0x02c6, 0x00, 0x08 }; + +#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c new file mode 100644 index 000000000..90481efd0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c @@ -0,0 +1,325 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (b ^ c ^ d) +#define F3(b,c,d) ((b & c) | (d & (b | c))) +#define F4(b,c,d) (b ^ c ^ d) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + +#define W(x) w[(x) & 15] + +#define step00_19(i,a,b,c,d,e) \ + if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + else W(i) = to_be32(ww[i]); \ + e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \ + b = rol32(b,30) + +#define step20_39(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \ + b = rol32(b,30) + +#define step40_59(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \ + b = rol32(b,30) + +#define step60_79(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \ + b = rol32(b,30) + +static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX sha1_single(const void *data, uint32_t digest[]); +static inline void hash_init_digest(SHA1_WORD_T * digest); + +void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr) +{ +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + + sha1_init(ctx, buffer, len); + sha1_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + sha1_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sha1_update(ctx, buffer, len); + sha1_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sha1_init(ctx, buffer, len); + remain_len = sha1_update(ctx, buffer, len); + sha1_final(ctx, remain_len); + } + + return ctx; +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + + while (remain_len >= SHA1_BLOCK_SIZE) { + sha1_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SHA1_BLOCK_SIZE); + remain_len -= SHA1_BLOCK_SIZE; + ctx->total_length += SHA1_BLOCK_SIZE; + } + + ctx->status = HASH_CTX_STS_IDLE; + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len, j; + uint8_t buf[2 * SHA1_BLOCK_SIZE]; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy(buf, buffer, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE) + i = 2 * SHA1_BLOCK_SIZE; + else + i = SHA1_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sha1_single(buf, digest); + if (i == 2 * SHA1_BLOCK_SIZE) { + sha1_single(buf + SHA1_BLOCK_SIZE, digest); + } + + ctx->status = HASH_CTX_STS_COMPLETE; +} + +void sha1_single(const void *data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e; + uint32_t w[16] = { 0 }; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + + step00_19(0, a, b, c, d, e); + step00_19(1, e, a, b, c, d); + step00_19(2, d, e, a, b, c); + step00_19(3, c, d, e, a, b); + step00_19(4, b, c, d, e, a); + step00_19(5, a, b, c, d, e); + step00_19(6, e, a, b, c, d); + step00_19(7, d, e, a, b, c); + step00_19(8, c, d, e, a, b); + step00_19(9, b, c, d, e, a); + step00_19(10, a, b, c, d, e); + step00_19(11, e, a, b, c, d); + step00_19(12, d, e, a, b, c); + step00_19(13, c, d, e, a, b); + step00_19(14, b, c, d, e, a); + step00_19(15, a, b, c, d, e); + step00_19(16, e, a, b, c, d); + step00_19(17, d, e, a, b, c); + step00_19(18, c, d, e, a, b); + step00_19(19, b, c, d, e, a); + + step20_39(20, a, b, c, d, e); + step20_39(21, e, a, b, c, d); + step20_39(22, d, e, a, b, c); + step20_39(23, c, d, e, a, b); + step20_39(24, b, c, d, e, a); + step20_39(25, a, b, c, d, e); + step20_39(26, e, a, b, c, d); + step20_39(27, d, e, a, b, c); + step20_39(28, c, d, e, a, b); + step20_39(29, b, c, d, e, a); + step20_39(30, a, b, c, d, e); + step20_39(31, e, a, b, c, d); + step20_39(32, d, e, a, b, c); + step20_39(33, c, 
d, e, a, b); + step20_39(34, b, c, d, e, a); + step20_39(35, a, b, c, d, e); + step20_39(36, e, a, b, c, d); + step20_39(37, d, e, a, b, c); + step20_39(38, c, d, e, a, b); + step20_39(39, b, c, d, e, a); + + step40_59(40, a, b, c, d, e); + step40_59(41, e, a, b, c, d); + step40_59(42, d, e, a, b, c); + step40_59(43, c, d, e, a, b); + step40_59(44, b, c, d, e, a); + step40_59(45, a, b, c, d, e); + step40_59(46, e, a, b, c, d); + step40_59(47, d, e, a, b, c); + step40_59(48, c, d, e, a, b); + step40_59(49, b, c, d, e, a); + step40_59(50, a, b, c, d, e); + step40_59(51, e, a, b, c, d); + step40_59(52, d, e, a, b, c); + step40_59(53, c, d, e, a, b); + step40_59(54, b, c, d, e, a); + step40_59(55, a, b, c, d, e); + step40_59(56, e, a, b, c, d); + step40_59(57, d, e, a, b, c); + step40_59(58, c, d, e, a, b); + step40_59(59, b, c, d, e, a); + + step60_79(60, a, b, c, d, e); + step60_79(61, e, a, b, c, d); + step60_79(62, d, e, a, b, c); + step60_79(63, c, d, e, a, b); + step60_79(64, b, c, d, e, a); + step60_79(65, a, b, c, d, e); + step60_79(66, e, a, b, c, d); + step60_79(67, d, e, a, b, c); + step60_79(68, c, d, e, a, b); + step60_79(69, b, c, d, e, a); + step60_79(70, a, b, c, d, e); + step60_79(71, e, a, b, c, d); + step60_79(72, d, e, a, b, c); + step60_79(73, c, d, e, a, b); + step60_79(74, b, c, d, e, a); + step60_79(75, a, b, c, d, e); + step60_79(76, e, a, b, c, d); + step60_79(77, d, e, a, b, c); + step60_79(78, c, d, e, a, b); + step60_79(79, b, c, d, e, a); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +struct slver sha1_ctx_mgr_init_base_slver_00000192; +struct slver sha1_ctx_mgr_init_base_slver = { 0x0192, 0x00, 0x00 }; + +struct slver sha1_ctx_mgr_submit_base_slver_00000193; +struct slver sha1_ctx_mgr_submit_base_slver = { 0x0193, 0x00, 0x00 }; + +struct slver sha1_ctx_mgr_flush_base_slver_00000194; +struct slver sha1_ctx_mgr_flush_base_slver = { 0x0194, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c new file mode 100644 index 000000000..32eb07f6e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include "sha1_mb.h" +#include "memcpy_inline.h" + +extern void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr); +extern SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags); +extern SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr); + +void sha1_ctx_mgr_init(SHA1_HASH_CTX_MGR * mgr) +{ + return sha1_ctx_mgr_init_base(mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sha1_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush(SHA1_HASH_CTX_MGR * mgr) +{ + return sha1_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c new file mode 100644 index 000000000..db70ee015 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c @@ -0,0 +1,251 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_sse(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. 
+ if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_sse_slver_00020139; +struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 }; + +struct slver sha1_ctx_mgr_submit_sse_slver_00020140; +struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 }; + +struct slver sha1_ctx_mgr_flush_sse_slver_00020141; +struct slver 
sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c new file mode 100644 index 000000000..d3c7687d2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c @@ -0,0 +1,259 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_SHANI + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_sse_ni(SHA1_HASH_CTX_MGR * mgr) +{ + // Same with sse + sha1_mb_mgr_init_sse(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse_ni(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse_ni(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
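+			// Example (SHA1_BLOCK_SIZE == 64): for len == 200 the masked
+			// remainder is 200 & 63 == 8, so 8 bytes are parked in the
+			// partial block buffer and the remaining 192 bytes are
+			// submitted as 192 >> SHA1_LOG2_BLOCK_SIZE == 3 whole blocks.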
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_sse_ni_slver_070002c1; +struct slver sha1_ctx_mgr_init_sse_ni_slver = { 0x02c1, 0x00, 0x07 }; + +struct slver sha1_ctx_mgr_submit_sse_ni_slver_070002c2; +struct slver sha1_ctx_mgr_submit_sse_ni_slver = { 0x02c2, 0x00, 0x07 }; + +struct slver sha1_ctx_mgr_flush_sse_ni_slver_070002c3; +struct slver sha1_ctx_mgr_flush_sse_ni_slver = { 0x02c3, 0x00, 0x07 }; + +#endif // HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm new file mode 100644 index 000000000..1c9a66fd4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm @@ -0,0 +1,67 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Threshold constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; if number of lanes in use <= threshold, using sb func +%define SHA1_SB_THRESHOLD_SSE 1 +%define SHA1_SB_THRESHOLD_AVX 1 +%define SHA1_SB_THRESHOLD_AVX2 1 +%define SHA1_SB_THRESHOLD_AVX512 1 +%define SHA1_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha1_mb +%define SHA1_NI_SB_THRESHOLD_AVX512 6 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA1_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA1_JOB + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 4, 4 ; length in bytes +FIELD _result_digest, 5*4, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + +%assign _SHA1_JOB_size _FIELD_OFFSET +%assign _SHA1_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c new file mode 100644 index 000000000..4bf2e09b5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c @@ -0,0 +1,146 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha1_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS (SHA1_MAX_LANES - 1) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +uint8_t lens_print_and_check(SHA1_HASH_CTX_MGR * mgr) +{ + static int32_t last_lens[SHA1_MAX_LANES] = { 0 }; + int32_t len; + uint8_t num_unchanged = 0; + int i; + for (i = 0; i < SHA1_MAX_LANES; i++) { + len = (int32_t) mgr->mgr.lens[i]; + // len[i] in mgr consists of byte_length<<4 | lane_index + len = (len >= 16) ? (len >> 4 << 6) : 0; + printf("\t%d", len); + if (last_lens[i] > 0 && last_lens[i] == len) + num_unchanged += 1; + last_lens[i] = len; + } + printf("\n"); + return num_unchanged; +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + uint8_t num_ret, num_unchanged = 0; + int ret; + + printf("sha1_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + lens[i] = TEST_LEN / SHA1_MAX_LANES * (i + 1); + bufs[i] = (unsigned char *)malloc(lens[i]); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], lens[i]); + } + + for (i = 0; i < TEST_BUFS; i++) { + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + printf("Changes of lens inside mgr:\n"); + lens_print_and_check(mgr); + while (sha1_ctx_mgr_flush(mgr)) { + num_ret = lens_print_and_check(mgr); + num_unchanged = num_unchanged > num_ret ? 
num_unchanged : num_ret; + } + printf("Info of sha1_mb lens prints over\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) + printf("Test failed function check %d\n", fail); + else if (num_unchanged) + printf("SHA-NI is used when %d or %d jobs are uncompleted\n", + num_unchanged, num_unchanged + 1); + else + printf("SHA-NI is not used, or used for last job\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm new file mode 100644 index 000000000..21c81403b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm @@ -0,0 +1,74 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA1 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA1_ARGS_X16 +;;; name size align +FIELD _digest, 4*5*16, 16 ; transposed digest +FIELD _data_ptr, 8*16, 8 ; array of pointers to data +END_FIELDS + +%assign _SHA1_ARGS_X4_size _FIELD_OFFSET +%assign _SHA1_ARGS_X4_align _STRUCT_ALIGN +%assign _SHA1_ARGS_X8_size _FIELD_OFFSET +%assign _SHA1_ARGS_X8_align _STRUCT_ALIGN +%assign _SHA1_ARGS_X16_size _FIELD_OFFSET +%assign _SHA1_ARGS_X16_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align +FIELD _lens, 4*16, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..c5fd71300 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm @@ -0,0 +1,247 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x4_avx +extern sha1_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be in a register not clobberred by sha1_mult +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be in a register not clobberred by sha1_mult +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*2 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_avx, function +sha1_mb_mgr_flush_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len 
is arg2, idx and nlane in r10 + call sha1_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..a47ae2838 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm @@ -0,0 +1,273 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x8_avx2 +extern sha1_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sha1_mb_x8_avx2 and sha1_opt_x1 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_avx2, function +sha1_mb_mgr_flush_avx2: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + 
cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX2 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha1_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x8_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*32] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 
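+	; Each packed entry in _lens is (block_count << 4) | lane_index.  ANDing the
+	; minimum with clear_low_nibble strips the 4-bit lane index before it is
+	; broadcast and subtracted, so only whole blocks are deducted from every
+	; lane's remaining length in mb_processing above.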
+one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..5e3db5b9b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm @@ -0,0 +1,271 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sha1_mb_x16_avx512 +extern sha1_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%else +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common definitions and latter-state(unused,covered,unchanged) +%define state arg1 ; unchanged +%define job arg2 ; unused +%define len2 arg2 ; unused + +; idx must be a register not clobberred by sha1_mb_x16_avx512 +%define idx rbp ; unchanged + +%define unused_lanes rbx ; covered +%define lane_data rbx ; covered +%define tmp2 rbx ; covered + +%define num_lanes_inuse r9 ; covered + +%define job_rax rax ; covered +%define tmp rax ; unused + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_avx512, function +sha1_mb_mgr_flush_avx512: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + 
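+	; One more vpminud below leaves the smallest packed length in the low
+	; dword; because each entry is (block_count << 4) | lane_index, that
+	; minimum also identifies the lane that will complete first.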
vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX512 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha1_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x16_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*64] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_mb_mgr_flush_avx512 +no_sha1_mb_mgr_flush_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm new file mode 100644 index 000000000..4170b6c73 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm @@ -0,0 +1,278 @@ 
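+; sha1_mb_mgr_flush_avx512_ni mirrors sha1_mb_mgr_flush_avx512 above; the visible
+; differences are the single-buffer fallback, which calls the SHA-NI routine
+; sha1_ni_x1 under the SHA1_NI_SB_THRESHOLD_AVX512 threshold, and the extra
+; HAVE_AS_KNOWS_SHANI guard alongside HAVE_AS_KNOWS_AVX512.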
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + %ifdef HAVE_AS_KNOWS_SHANI + +extern sha1_mb_x16_avx512 +extern sha1_ni_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%else +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common definitions and latter-state(unused,covered,unchanged) +%define state arg1 ; unchanged +%define job arg2 ; unused +%define len2 arg2 ; unused + +; idx must be a register not clobberred by sha1_mb_x16_avx512 +%define idx rbp ; unchanged + +%define unused_lanes rbx ; covered +%define lane_data rbx ; covered +%define tmp2 rbx ; covered + +%define num_lanes_inuse r9 ; covered + +%define job_rax rax ; covered +%define tmp rax ; unused + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_avx512_ni, function +sha1_mb_mgr_flush_avx512_ni: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 
8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func + cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_AVX512 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha1_ni_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x16_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*64] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + 
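+	; _args_digest is stored transposed for the 16-lane AVX-512 layout:
+	; word w of lane idx lives at offset 4*idx + w*64, so the five working
+	; words gathered above are written out contiguously as the completed
+	; job's result_digest.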
+return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + + %else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha1_mb_mgr_flush_avx512_ni + no_sha1_mb_mgr_flush_avx512_ni: + %endif + %endif ; HAVE_AS_KNOWS_SHANI +%else +%ifidn __OUTPUT_FORMAT__, win64 + global no_sha1_mb_mgr_flush_avx512_ni + no_sha1_mb_mgr_flush_avx512_ni: + %endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm new file mode 100644 index 000000000..2a4c4b50a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm @@ -0,0 +1,249 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x4_sse +extern sha1_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than ARG1, ARG2, rax, r8-r11 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than ARG1, ARG2, rax, r8-r11 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*2 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_sse, function +sha1_mb_mgr_flush_sse: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_SSE + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane 
in r10 + call sha1_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16] + + movdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm new file mode 100644 index 000000000..ea3cffd33 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm @@ -0,0 +1,256 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI +extern sha1_mb_x4_sse +extern sha1_ni_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than ARG1, ARG2, rax, r8-r11 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than ARG1, ARG2, rax, r8-r11 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*2 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA1_JOB* sha1_mb_mgr_flush_sse_ni(SHA1_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha1_mb_mgr_flush_sse_ni, function +sha1_mb_mgr_flush_sse_ni: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp 
lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_SSE + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha1_ni_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16] + + movdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp2) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + +%else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha1_mb_mgr_flush_sse_ni + no_sha1_mb_mgr_flush_sse_ni: + %endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c new file mode 100644 index 000000000..b6124486a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_X8_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c new file mode 100644 index 000000000..033fb3c9f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
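The init routines above and below seed unused_lanes with a nibble-packed stack of free lane indices (0xF76543210 for the eight AVX2 lanes, 0xF3210 for the four SSE lanes, 0xfedcba9876543210 for the sixteen AVX512 lanes, with 0xF marking the end of the stack in the 4- and 8-lane variants), and the submit/flush managers pop and push 4-bit lane numbers from that value. The following is only an illustrative C sketch of the encoding, not part of the imported sources; pop_lane and push_lane are hypothetical helper names mirroring the "and lane, 0xF / shr unused_lanes, 4" and "shl unused_lanes, 4 / or unused_lanes, idx" sequences in the assembly managers.

#include <stdint.h>

// Pop the next free lane index from the nibble stack (submit path).
static inline uint32_t pop_lane(uint64_t *unused_lanes)
{
	uint32_t lane = (uint32_t)(*unused_lanes & 0xF);  // lowest nibble = next free lane
	*unused_lanes >>= 4;                              // drop it from the stack
	return lane;
}

// Push a completed lane back onto the stack (flush/completion path).
static inline void push_lane(uint64_t *unused_lanes, uint32_t lane)
{
	*unused_lanes = (*unused_lanes << 4) | lane;
}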
+**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_MAX_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c new file mode 100644 index 000000000..811c4a9dd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF3210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_MIN_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..49c018138 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm @@ -0,0 +1,246 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x4_avx + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be in a register not clobberred by sha1_mult +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%else +; LINUX register definitions +%define arg1 rdi +%define arg2 rsi + +; idx needs to be in a register not clobberred by sha1_mult +%define last_len rdx +%define idx rdx + +%define size_offset rcx +%define tmp2 rcx + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*4 + 16*10 + 8 + +; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha1_mb_mgr_submit_avx, function +sha1_mb_mgr_submit_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*4 + 16*0], xmm6 + vmovdqa [rsp + 8*4 + 16*1], xmm7 + vmovdqa [rsp + 8*4 + 16*2], xmm8 + vmovdqa [rsp + 8*4 + 16*3], xmm9 + vmovdqa [rsp + 8*4 + 16*4], xmm10 + vmovdqa [rsp + 8*4 + 16*5], xmm11 + vmovdqa [rsp + 8*4 + 16*6], xmm12 + vmovdqa [rsp + 8*4 + 16*7], xmm13 + vmovdqa [rsp + 8*4 + 16*8], xmm14 + vmovdqa [rsp + 8*4 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + mov DWORD(tmp), [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*16], xmm0 + vpextrd [state + 
_args_digest + 4*lane + 1*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp) + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*4 + 16*0] + vmovdqa xmm7, [rsp + 8*4 + 16*1] + vmovdqa xmm8, [rsp + 8*4 + 16*2] + vmovdqa xmm9, [rsp + 8*4 + 16*3] + vmovdqa xmm10, [rsp + 8*4 + 16*4] + vmovdqa xmm11, [rsp + 8*4 + 16*5] + vmovdqa xmm12, [rsp + 8*4 + 16*6] + vmovdqa xmm13, [rsp + 8*4 + 16*7] + vmovdqa xmm14, [rsp + 8*4 + 16*8] + vmovdqa xmm15, [rsp + 8*4 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +H0: dd 0x67452301 +H1: dd 0xefcdab89 +H2: dd 0x98badcfe +H3: dd 0x10325476 +H4: dd 0xc3d2e1f0 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..95b4f1715 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm @@ -0,0 +1,250 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "memcpy.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%define extra_blocks rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%define extra_blocks rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +; idx must be a register not clobberred by sha1_x8_avx2 +%define idx r8 +%define last_len r8 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA1 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha1_mb_mgr_submit_avx2, function +sha1_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + 
_status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + + shl len,4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + mov DWORD(tmp), [job + _result_digest + 1*16] + + vmovd [state + _args_digest + 4*lane + 0*32], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3 + mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp) + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x8_avx2 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*32] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3 + mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..a4f9389a1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm @@ -0,0 +1,248 @@ 
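Both the scalar SSE/AVX length scans and the vpminud/vpalignr reductions in the AVX2 and AVX512 submit paths rely on the same packing: each lens[] word is (length_in_blocks << 4) | lane, so a single unsigned minimum over the array yields the shortest job and the lane that owns it in one value. A scalar C sketch of that step, for illustration only (nlanes and lens stand in for the SHA1_MB_JOB_MGR fields):

#include <stdint.h>

static void find_min_job(const uint32_t *lens, int nlanes,
                         uint32_t *lane_out, uint32_t *blocks_out)
{
	uint32_t min = lens[0];
	int i;

	for (i = 1; i < nlanes; i++)
		if (lens[i] < min)
			min = lens[i];

	*lane_out = min & 0xF;       // low nibble: lane index of the shortest job
	*blocks_out = min >> 4;      // remaining bits: its length in 64-byte blocks
}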
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "memcpy.asm" +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sha1_mb_x16_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%else +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common definitions and latter-state(unused,covered,unchanged) +%define state arg1 ; unchanged, mb_x16's input1 +%define job arg2 ; arg2 unused +%define len2 arg2 ; arg2 unused, mb_x16's input2 + +; idx must be a register not clobberred by sha1_x16_avx512 +%define idx r8 ; unchanged + +%define p r11 ; unused + +%define unused_lanes rbx ; covered + +%define job_rax rax ; covered +%define len rax ; unused + +%define lane rbp ; unused + +%define tmp r9 ; covered +%define num_lanes_inuse r9 ; covered + +%define lane_data r10 ; covered + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha1_mb_mgr_submit_avx512, function +sha1_mb_mgr_submit_avx512: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + 
vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + + shl len,4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + mov DWORD(tmp), [job + _result_digest + 1*16] + + vmovd [state + _args_digest + 4*lane + 0*64], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3 + mov [state + _args_digest + 4*lane + 4*64], DWORD(tmp) + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 16 + jne return_null + +start_loop: + ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8 + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF ; idx represent min length index + shr len2, 4 ; size in blocks + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x16_avx512 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovd xmm0, [state + _args_digest + 4*idx + 0*64] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3 + mov DWORD(tmp), [state + _args_digest + 4*idx + 4*64] + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 
8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=32 + +align 32 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_mb_mgr_submit_avx512 +no_sha1_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..9989a9a1d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm @@ -0,0 +1,246 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
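Once the minimum is known, the AVX2 and AVX512 submit paths above mask it with clear_low_nibble and broadcast-subtract it from every lens[] word, so each lane's remaining block count shrinks by the amount about to be hashed while its 4-bit lane ID stays intact. An equivalent scalar C sketch, illustrative only (min_packed is the packed minimum found by the length scan):

#include <stdint.h>

static void consume_blocks(uint32_t *lens, int nlanes, uint32_t min_packed)
{
	uint32_t blocks = min_packed & ~0xFu;   // keep blocks<<4, clear the lane nibble
	int i;

	for (i = 0; i < nlanes; i++)
		lens[i] -= blocks;              // low nibble (lane ID) is unchanged
}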
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha1_mb_x4_sse + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than ARG2, rax, r8-r11 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%else +; LINUX register definitions +%define arg1 rdi +%define arg2 rsi + +; idx needs to be other than ARG2, rax, r8-r11 +%define last_len rdx +%define idx rdx + +%define size_offset rcx +%define tmp2 rcx + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*4 + 16*10 + 8 + +; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha1_mb_mgr_submit_sse, function +sha1_mb_mgr_submit_sse: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + movdqa [rsp + 8*4 + 16*0], xmm6 + movdqa [rsp + 8*4 + 16*1], xmm7 + movdqa [rsp + 8*4 + 16*2], xmm8 + movdqa [rsp + 8*4 + 16*3], xmm9 + movdqa [rsp + 8*4 + 16*4], xmm10 + movdqa [rsp + 8*4 + 16*5], xmm11 + movdqa [rsp + 8*4 + 16*6], xmm12 + movdqa [rsp + 8*4 + 16*7], xmm13 + movdqa [rsp + 8*4 + 16*8], xmm14 + movdqa [rsp + 8*4 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + mov DWORD(tmp), [job + _result_digest + 1*16] + movd [state + _args_digest + 4*lane + 0*16], xmm0 + pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp) + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and 
"args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16] + + movdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 8*4 + 16*0] + movdqa xmm7, [rsp + 8*4 + 16*1] + movdqa xmm8, [rsp + 8*4 + 16*2] + movdqa xmm9, [rsp + 8*4 + 16*3] + movdqa xmm10, [rsp + 8*4 + 16*4] + movdqa xmm11, [rsp + 8*4 + 16*5] + movdqa xmm12, [rsp + 8*4 + 16*6] + movdqa xmm13, [rsp + 8*4 + 16*7] + movdqa xmm14, [rsp + 8*4 + 16*8] + movdqa xmm15, [rsp + 8*4 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +H0: dd 0x67452301 +H1: dd 0xefcdab89 +H2: dd 0x98badcfe +H3: dd 0x10325476 +H4: dd 0xc3d2e1f0 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm new file mode 100644 index 000000000..979324de4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm @@ -0,0 +1,290 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_job.asm" +%include "sha1_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI +extern sha1_mb_x4_sse +extern sha1_ni_x2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, win64 +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than ARG2, rax, r8-r11 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%else +; LINUX register definitions +%define arg1 rdi +%define arg2 rsi + +; idx needs to be other than ARG2, rax, r8-r11 +%define last_len rdx +%define idx rdx + +%define size_offset rcx +%define tmp2 rcx + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*6 + 16*10 + 8 + +; SHA1_JOB* sha1_mb_mgr_submit_sse_ni(SHA1_MB_JOB_MGR *state, SHA1_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha1_mb_mgr_submit_sse_ni, function +sha1_mb_mgr_submit_sse_ni: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + movdqa [rsp + 8*4 + 16*0], xmm6 + movdqa [rsp + 8*4 + 16*1], xmm7 + movdqa [rsp + 8*4 + 16*2], xmm8 + movdqa [rsp + 8*4 + 16*3], xmm9 + movdqa [rsp + 8*4 + 16*4], xmm10 + movdqa [rsp + 8*4 + 16*5], xmm11 + movdqa [rsp + 8*4 + 16*6], xmm12 + movdqa [rsp + 8*4 + 16*7], xmm13 + movdqa [rsp + 8*4 + 16*8], xmm14 + movdqa [rsp + 8*4 + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + mov DWORD(tmp), [job + _result_digest + 1*16] + movd [state + _args_digest + 4*lane + 0*16], xmm0 + pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp) + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + + cmp unused_lanes, 0xF32 ; we will process two jobs at the same time + 
jne return_null ; wait for another sha_ni job + + ; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func + %if SHA1_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb + ; shani glue code + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + ; lensN-len2=idx + sub lens0, len2 + sub lens1, len2 + + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha1_ni_x2 + ; state and idx are intact + + %else + ; original mb code + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mb_x4_sse + ; state and idx are intact + %endif + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16] + + movdqa [job_rax + _result_digest + 0*16], xmm0 + mov [job_rax + _result_digest + 1*16], DWORD(tmp) + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 8*4 + 16*0] + movdqa xmm7, [rsp + 8*4 + 16*1] + movdqa xmm8, [rsp + 8*4 + 16*2] + movdqa xmm9, [rsp + 8*4 + 16*3] + movdqa xmm10, [rsp + 8*4 + 16*4] + movdqa xmm11, [rsp + 8*4 + 16*5] + movdqa xmm12, [rsp + 8*4 + 16*6] + movdqa xmm13, [rsp + 8*4 + 16*7] + movdqa xmm14, [rsp + 8*4 + 16*8] + movdqa xmm15, [rsp + 8*4 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x67452301 +H1: dd 0xefcdab89 +H2: dd 0x98badcfe +H3: dd 0x10325476 +H4: dd 0xc3d2e1f0 + +%else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha1_mb_mgr_submit_sse_ni + no_sha1_mb_mgr_submit_sse_ni: + %endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c new file mode 100644 index 000000000..3925a6f4b --- 
/dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c @@ -0,0 +1,159 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha1_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + SHA1(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, 
digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + SHA1(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c new file mode 100644 index 000000000..4eeeaba0a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c @@ -0,0 +1,202 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
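The random tests above and below all drive the hash through the same context-manager pattern: initialize the manager, submit each buffer as a HASH_ENTIRE job, then call flush in a loop until it returns NULL and read the digest out of each context's job. A condensed single-buffer C sketch of that usage, assuming the calls shown in the tests (sha1_one_buffer itself is a hypothetical wrapper, not an ISA-L API):

#include <stdlib.h>
#include "sha1_mb.h"

static int sha1_one_buffer(void *buf, uint32_t len, uint32_t digest[SHA1_DIGEST_NWORDS])
{
	SHA1_HASH_CTX_MGR *mgr = NULL;
	SHA1_HASH_CTX ctx;
	int j;

	if (posix_memalign((void *)&mgr, 16, sizeof(*mgr)) || mgr == NULL)
		return 1;

	sha1_ctx_mgr_init(mgr);
	hash_ctx_init(&ctx);
	sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);

	while (sha1_ctx_mgr_flush(mgr))   // drain any lanes still in flight
		;

	for (j = 0; j < SHA1_DIGEST_NWORDS; j++)
		digest[j] = ctx.job.result_digest[j];

	free(mgr);
	return 0;
}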
+**********************************************************************/ + +#include +#include +#include "sha1_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + int ret; + + printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], TEST_LEN); + + // Run sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sha1_mb test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sha1_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sha1_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_sha1 test + sha1_ctx_mgr_submit(mgr, 
&ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c new file mode 100644 index 000000000..aaa52a0ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c @@ -0,0 +1,297 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "sha1_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS]; + +extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_sha1 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sha1_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha1_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha1_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, 
ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sha1_ref(bufs[i], digest_ref[i], lens[i]); + } + + sha1_ctx_mgr_init(mgr); + + // Run sha1_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SHA1_BLOCK_SIZE + + SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SHA1_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
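[Editor's note] The comment block above describes the full submit/resubmit state machine; with only a single job in flight it collapses to a small loop. The sketch below is an illustration, not part of the upstream test: the helper name is invented, it reuses the UPDATE_SIZE-style chunking and the sha1_mb.h calls exercised in this file, and it assumes the buffer is longer than one chunk.

#include <stdint.h>
#include "sha1_mb.h"

/* Hash one buffer in fixed-size chunks through the multi-buffer manager.
 * With only one job in flight, any non-NULL context handed back by
 * submit/flush is our own context asking for more data. */
static void sha1_mb_one_buffer(SHA1_HASH_CTX_MGR *mgr, SHA1_HASH_CTX *c,
                               unsigned char *buf, uint32_t len)
{
        const uint32_t chunk = 13 * SHA1_BLOCK_SIZE;    /* same as UPDATE_SIZE */
        uint32_t done = chunk;
        SHA1_HASH_CTX *r;

        hash_ctx_init(c);
        r = sha1_ctx_mgr_submit(mgr, c, buf, chunk, HASH_FIRST);

        while (!hash_ctx_complete(c)) {
                if (r == NULL) {                /* job is queued; force progress */
                        r = sha1_ctx_mgr_flush(mgr);
                        continue;
                }
                /* r == c here: the job wants its next piece of data */
                if (len - done <= chunk) {
                        r = sha1_ctx_mgr_submit(mgr, c, buf + done,
                                                len - done, HASH_LAST);
                        done = len;
                } else {
                        r = sha1_ctx_mgr_submit(mgr, c, buf + done,
                                                chunk, HASH_UPDATE);
                        done += chunk;
                }
        }
        /* c->job.result_digest[0 .. SHA1_DIGEST_NWORDS-1] now holds the hash */
}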
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sha1_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha1_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SHA1_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha1_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c new file mode 100644 index 000000000..6261bbf44 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c @@ -0,0 +1,233 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include "sha1_mb.h" +#include "endian_helper.h" + +typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS]; + +#define MSGS 7 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS +static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +static DigestSHA1 expResultDigest1 = + { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 }; + +static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +static DigestSHA1 expResultDigest2 = + { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D }; + +static uint8_t msg3[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<"; +static DigestSHA1 expResultDigest3 = + { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 }; + +static uint8_t msg4[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +static DigestSHA1 expResultDigest4 = + { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 }; + +static uint8_t msg5[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?"; +static DigestSHA1 expResultDigest5 = + { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 }; + +static uint8_t msg6[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +static DigestSHA1 expResultDigest6 = + { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 }; + +static uint8_t msg7[] = ""; +static DigestSHA1 expResultDigest7 = + { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 }; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; + +static uint32_t *expResultDigest[MSGS] = { + expResultDigest1, expResultDigest2, expResultDigest3, + expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7 +}; + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], msgs[i], + strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sha1_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sha1_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha1 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c new file mode 100644 index 000000000..bd8e5e527 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c @@ -0,0 +1,128 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha1_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS]; + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha1_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA1(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha1_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sha1_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha1" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sha1 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c 
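[Editor's note] A small reading aid for the perf reporting used in the file above and in the shortage test that follows: the byte count handed to perf_print relies on the loop counters keeping their final values, i.e. t == TEST_LOOPS and i == the number of buffers hashed per iteration (TEST_BUFS above, nlanes in the shortage sweep below). Spelled out with the same test.h helpers, the call in the file above is equivalent to this hypothetical rewrite:

        /* equivalent to the (long long)TEST_LEN * i * t expression above */
        long long total_bytes = (long long)TEST_LEN * TEST_BUFS * TEST_LOOPS;  /* == TEST_MEM */
        perf_print(stop, start, total_bytes);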
new file mode 100644 index 000000000..0b4438d53 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c @@ -0,0 +1,132 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha1_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS SHA1_MAX_LANES + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
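/* Cold case: TEST_LEN is sized so that one pass over all TEST_BUFS buffers
 * (TEST_BUFS * TEST_LEN == GT_L3_CACHE) exceeds the last-level cache, so
 * every timed iteration streams its input from memory rather than cache. */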
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS]; + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + uint32_t nlanes; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha1_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA1(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha1_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb shortage tests + for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) { + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < nlanes; i++) + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, + HASH_ENTIRE); + + while (sha1_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha1" TEST_TYPE_STR " with %d lanes: ", nlanes); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < nlanes; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + } + + printf("Multi-buffer sha1 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm new file mode 100644 index 000000000..d64ffe2bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm @@ -0,0 +1,563 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;; code to compute oct SHA1 using AVX-512 +;; outer calling routine takes care of save and restore of XMM registers + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define KT zmm5 +%define AA zmm6 +%define BB zmm7 +%define CC zmm8 +%define DD zmm9 +%define EE zmm10 +%define TMP0 zmm11 +%define TMP1 zmm12 +%define TMP2 zmm13 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + +; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h15 h14 h13 h12 
h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} + +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; 
r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} + +;; At this point, the registers that contain interesting data are: +;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 +;; Can use t1 and r14 as scratch registers + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 
n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + +;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%F_IMMED %2 + + ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt + ; E=D, D=C, C=ROTL_30(B), B=A, A=T + + ; Ft + ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) + ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D + ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) + + vmovdqa32 TMP1, B ; Copy B + vpaddd E, E, %%WT ; E = E + Wt + vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) + vpaddd E, E, KT ; E = E + Wt + Kt + vprold TMP0, A, 5 ; TMP0 = ROTL_5(A) + vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt + vprold B, B, 30 ; B = ROTL_30(B) + vpaddd E, E, TMP0 ; E = T + + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79 4 +%define %%WT %1 +%define %%WTp2 %2 +%define %%WTp8 %3 +%define %%WTp13 %4 + ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16) + ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt) + vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 + vpxord %%WT, %%WT, %%WTp13 + vprold %%WT, %%WT, 1 +%endmacro + +; Note this is reading in a block of data for one lane +; When all 16 are read, the data must be transposed to build msg schedule +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +align 64 + +; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha1_mb_x16_avx512) +sha1_mb_x16_avx512: + 
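	; Per 64-byte block: the 16 lanes' chunks (loaded untransposed into
	; W0..W15) are transposed into message words and byte-swapped, then
	; rounds 0..63 run while scheduling W16..W79 on the fly.  For all but
	; the final block, rounds 64..79 are interleaved with loading the next
	; block's 16 chunks; lastLoop runs the final block's rounds 64..79
	; with no further loads before the digests are stored.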
endbranch + + ;; Initialize digests + vmovups A, [DIGEST + 0*64] + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + + xor IDX, IDX + + ;; transpose input onto stack + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + + vmovups W0,[inp0+IDX] + vmovups W1,[inp1+IDX] + vmovups W2,[inp2+IDX] + vmovups W3,[inp3+IDX] + vmovups W4,[inp4+IDX] + vmovups W5,[inp5+IDX] + vmovups W6,[inp6+IDX] + vmovups W7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups W8, [inp0+IDX] + vmovups W9, [inp1+IDX] + vmovups W10,[inp2+IDX] + vmovups W11,[inp3+IDX] + vmovups W12,[inp4+IDX] + vmovups W13,[inp5+IDX] + vmovups W14,[inp6+IDX] + vmovups W15,[inp7+IDX] + +lloop: + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + add IDX, 64 + + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + +%assign I 0 +%rep 16 + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + + ; Save digests for later addition + vmovdqa32 AA, A + vmovdqa32 BB, B + vmovdqa32 CC, C + vmovdqa32 DD, D + vmovdqa32 EE, E + + vmovdqa32 KT, [K00_19] +%assign I 0xCA +%assign J 0 +%assign K 2 +%assign L 8 +%assign M 13 +%assign N 0 +%rep 64 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %if N = 19 + vmovdqa32 KT, [K20_39] + %assign I 0x96 + %elif N = 39 + vmovdqa32 KT, [K40_59] + %assign I 0xE8 + %elif N = 59 + vmovdqa32 KT, [K60_79] + %assign I 0x96 + %endif +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%assign N (N+1) +%endrep + + ; Check if this is the last block + sub SIZE, 1 + je lastLoop + +%assign I 0x96 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_00_15 APPEND(W,J), J +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + jmp lloop + +lastLoop: +; Need to reset argument rotation values to Round 64 values +%xdefine TMP_ A +%xdefine A B +%xdefine B C +%xdefine C D +%xdefine D E +%xdefine E TMP_ + + ; Process last 16 rounds +%assign I 0x96 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + ;; update into data pointers +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + ; Write out digest + ; Do we need to untranspose digests??? 
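	; (No re-transpose is needed: the digest area stores one digest word
	;  across all 16 lanes per 64-byte row, which is exactly the layout
	;  A..E were loaded in at function entry, so the stores below simply
	;  mirror those loads.)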
+ vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + + ret + +section .data +align 64 +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_mb_x16_avx512 +no_sha1_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm new file mode 100644 index 000000000..eb67309da --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm @@ -0,0 +1,416 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA1 using AVX +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = ((B & C) | ((~ B) & D) ) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpand %%regF, %%regB,%%regC + vpandn %%regT, %%regB,%%regD + vpor %%regF, %%regT,%%regF +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + 
vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define XMM_SAVE ((15-15)*16 + 1*8) +%define FRAMESZ 16*16 + XMM_SAVE +%define _XMM FRAMESZ - XMM_SAVE + +%define VMOVPS vmovups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +%define DIGEST_SIZE (4*5*4) + +;%ifdef LINUX +%ifidn __OUTPUT_FORMAT__, elf64 + %define ARG1 rdi + %define ARG2 rsi +%else + ; Windows + %define ARG1 rcx + %define ARG2 rdx +%endif + +align 32 + +; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks); +; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used) +; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1 +; +; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15 +; +mk_global sha1_mb_x4_avx, function, internal +sha1_mb_x4_avx: + endbranch + + sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8 + + ;; Initialize digests + vmovdqa A, [ARG1 + 0*16] + vmovdqa B, [ARG1 + 1*16] + vmovdqa C, [ARG1 + 2*16] + vmovdqa D, [ARG1 + 3*16] + vmovdqa E, [ARG1 + 4*16] + + ;; load input pointers + mov inp0,[ARG1 + _data_ptr + 0*8] + mov inp1,[ARG1 + _data_ptr + 1*8] + mov inp2,[ARG1 + _data_ptr + 2*8] + mov inp3,[ARG1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + VMOVPS T2,[inp0+IDX] + VMOVPS T1,[inp1+IDX] + VMOVPS T4,[inp2+IDX] + VMOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vpshufb T0, T0, F + vmovdqa [rsp+(I*4+0)*16],T0 + vpshufb T1, T1, F + vmovdqa [rsp+(I*4+1)*16],T1 + vpshufb T2, T2, F + vmovdqa [rsp+(I*4+2)*16],T2 + vpshufb T3, T3, F + 
vmovdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E + +;; +;; perform 0-79 steps +;; + vmovdqa K, [K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [rsp + ((16 - 16) & 15) * 16] + vmovdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqa K, [K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqa K, [K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + vmovdqa K, [K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub ARG2, 1 + jne lloop + + ; write out digests + vmovdqa [ARG1 + 0*16], A + vmovdqa [ARG1 + 1*16], B + vmovdqa [ARG1 + 2*16], C + vmovdqa [ARG1 + 3*16], D + vmovdqa [ARG1 + 4*16], E + + ; update input pointers + add inp0, IDX + mov [ARG1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [ARG1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [ARG1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [ARG1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm new file mode 100644 index 000000000..5677dce73 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm @@ -0,0 +1,413 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA1 using SSE +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + paddd %%regE,[rsp + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT 
;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + movdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [rsp + ((%%memW - 8) & 15) * 16] + pxor W16, [rsp + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define XMM_SAVE ((15-15)*16 + 1*8) +%define FRAMESZ 16*16 + XMM_SAVE +%define _XMM FRAMESZ - XMM_SAVE + +%define MOVPS movups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +%define DIGEST_SIZE (4*5*4) + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define ARG1 rdi + %define ARG2 rsi +%else + ; Windows + %define ARG1 rcx + %define ARG2 rdx +%endif + +align 32 + +; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks); +; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used) +; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1 +; +; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15 +; +mk_global sha1_mb_x4_sse, function, internal +sha1_mb_x4_sse: + endbranch + + sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8 + + ;; Initialize digests + movdqa A, [ARG1 + 0*16] + movdqa B, [ARG1 + 1*16] + movdqa C, [ARG1 + 2*16] + movdqa D, [ARG1 + 3*16] + movdqa E, [ARG1 + 4*16] + + ;; load input pointers + mov inp0,[ARG1 + _data_ptr + 0*8] + mov inp1,[ARG1 + _data_ptr + 1*8] + mov inp2,[ARG1 + _data_ptr + 2*8] + mov inp3,[ARG1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + movdqa F, [PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + MOVPS T2,[inp0+IDX] + MOVPS T1,[inp1+IDX] + MOVPS T4,[inp2+IDX] + MOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + pshufb T0, F + movdqa [rsp+(I*4+0)*16],T0 + pshufb T1, F + movdqa [rsp+(I*4+1)*16],T1 + pshufb T2, F + movdqa [rsp+(I*4+2)*16],T2 + pshufb T3, F + movdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E + +;; +;; perform 0-79 steps +;; + movdqa K, [K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 
A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + movdqa W16, [rsp + ((16 - 16) & 15) * 16] + movdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + movdqa K, [K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + movdqa K, [K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + movdqa K, [K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + paddd A,AA + paddd B,BB + paddd C,CC + paddd D,DD + paddd E,EE + + sub ARG2, 1 + jne lloop + + ; write out digests + movdqa [ARG1 + 0*16], A + movdqa [ARG1 + 1*16], B + movdqa [ARG1 + 2*16], C + movdqa [ARG1 + 3*16], D + movdqa [ARG1 + 4*16], E + + ; update input pointers + add inp0, IDX + mov [ARG1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [ARG1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [ARG1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [ARG1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm new file mode 100644 index 000000000..edcba6d3f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm @@ -0,0 +1,518 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
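For reference, the x4 kernels above run the FIPS 180-1 round recurrence in four lanes at once; per lane, each SHA1_STEP_00_15/SHA1_STEP_16_79 invocation is the scalar update sketched below in C. This is a minimal illustration only (rol32, f0/f1/f2 and sha1_round are illustrative names, not isa-l symbols).

#include <stdint.h>
#include <stdio.h>

/* One SHA-1 round, mirroring SHA1_STEP_00_15:
 * E += rol(A,5) + F(B,C,D) + K + W[t]; B = rol(B,30); then A..E rotate. */
static uint32_t rol32(uint32_t x, int r) { return (x << r) | (x >> (32 - r)); }

/* The "magic" functions from FIPS 180-1 (MAGIC_F0/F1/F2 in the .asm above). */
static uint32_t f0(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (~b & d); }
static uint32_t f1(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
static uint32_t f2(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); }

static void sha1_round(uint32_t s[5], uint32_t w, uint32_t k,
                       uint32_t (*f)(uint32_t, uint32_t, uint32_t))
{
        uint32_t t = rol32(s[0], 5) + f(s[1], s[2], s[3]) + s[4] + k + w;
        s[4] = s[3];            /* E = D          */
        s[3] = s[2];            /* D = C          */
        s[2] = rol32(s[1], 30); /* C = rol(B, 30) */
        s[1] = s[0];            /* B = A          */
        s[0] = t;               /* A = new value  */
}

int main(void)
{
        /* Standard SHA-1 initial state; run a single round with W[0] = 0. */
        uint32_t s[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
        sha1_round(s, 0, 0x5A827999, f0);
        printf("%08X %08X %08X %08X %08X\n", s[0], s[1], s[2], s[3], s[4]);
        (void)f1; (void)f2;
        return 0;
}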
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute oct SHA1 using SSE-256 +;; outer calling routine takes care of save and restore of XMM registers + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers ymm0-15 + + +; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +; "transpose" data in {r0...r7} using temps {t0...t1} +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + +;; +;; Magic functions defined in FIPS 180-1 +;; +;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; F0 = ((B & C) | ((~B) & D)) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpand %%regF, %%regB,%%regC + vpandn %%regT, %%regB,%%regD + vpor %%regF, %%regT,%%regF +%endmacro + +;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D) +%macro MAGIC_F1 5 
+%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + + + +;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-%%imm) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 32)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqu W14, [rsp + ((%%memW - 14) & 15) * 32] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqu [rsp + ((%%memW - 0) & 15) * 32],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define YMM_SAVE (15-15)*32 +%define FRAMESZ 32*16 + 0*8 + YMM_SAVE +%define _YMM FRAMESZ - YMM_SAVE + +%define VMOVPS vmovups + +%define IDX rax +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define inp7 rcx + %define arg1 rdi + %define arg2 rsi + %define RSP_SAVE rdx +%else + %define inp7 rdi + %define arg1 rcx + %define arg2 rdx + %define RSP_SAVE rsi +%endif + + +; ymm0 A +; ymm1 B +; ymm2 C +; ymm3 D +; ymm4 E +; ymm5 F AA +; ymm6 T0 BB +; ymm7 T1 CC +; ymm8 T2 DD +; ymm9 T3 EE +; ymm10 T4 TMP +; ymm11 T5 FUN +; ymm12 T6 K +; ymm13 T7 W14 +; ymm14 T8 W15 +; ymm15 T9 W16 + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 + +%define F ymm5 
+%define T0 ymm6 +%define T1 ymm7 +%define T2 ymm8 +%define T3 ymm9 +%define T4 ymm10 +%define T5 ymm11 +%define T6 ymm12 +%define T7 ymm13 +%define T8 ymm14 +%define T9 ymm15 + +%define AA ymm5 +%define BB ymm6 +%define CC ymm7 +%define DD ymm8 +%define EE ymm9 +%define TMP ymm10 +%define FUN ymm11 +%define K ymm12 +%define W14 ymm13 +%define W15 ymm14 +%define W16 ymm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +%define DIGEST_SIZE (8*5*4) ; 8 streams x 5 32bit words per digest x 4 bytes per word + +align 32 + +; void sha1_x8_avx2(SHA1_MB_ARGS_X8, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +mk_global sha1_mb_x8_avx2, function, internal +sha1_mb_x8_avx2: + endbranch + + push RSP_SAVE + + ; save rsp + mov RSP_SAVE, rsp + sub rsp, FRAMESZ ;; FRAMESZ + pushes must be even multiple of 8 + + ; align rsp to 32 Bytes + and rsp, ~0x1F + + ;; Initialize digests + vmovdqu A, [arg1 + 0*32] + vmovdqu B, [arg1 + 1*32] + vmovdqu C, [arg1 + 2*32] + vmovdqu D, [arg1 + 3*32] + vmovdqu E, [arg1 + 4*32] + + ;; transpose input onto stack + mov inp0,[arg1+_data_ptr+0*8] + mov inp1,[arg1+_data_ptr+1*8] + mov inp2,[arg1+_data_ptr+2*8] + mov inp3,[arg1+_data_ptr+3*8] + mov inp4,[arg1+_data_ptr+4*8] + mov inp5,[arg1+_data_ptr+5*8] + mov inp6,[arg1+_data_ptr+6*8] + mov inp7,[arg1+_data_ptr+7*8] + + xor IDX, IDX +lloop: + vmovdqu F, [PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 2 + VMOVPS T0,[inp0+IDX] + VMOVPS T1,[inp1+IDX] + VMOVPS T2,[inp2+IDX] + VMOVPS T3,[inp3+IDX] + VMOVPS T4,[inp4+IDX] + VMOVPS T5,[inp5+IDX] + VMOVPS T6,[inp6+IDX] + VMOVPS T7,[inp7+IDX] + TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + vpshufb T0, T0, F + vmovdqu [rsp+(I*8+0)*32],T0 + vpshufb T1, T1, F + vmovdqu [rsp+(I*8+1)*32],T1 + vpshufb T2, T2, F + vmovdqu [rsp+(I*8+2)*32],T2 + vpshufb T3, T3, F + vmovdqu [rsp+(I*8+3)*32],T3 + vpshufb T4, T4, F + vmovdqu [rsp+(I*8+4)*32],T4 + vpshufb T5, T5, F + vmovdqu [rsp+(I*8+5)*32],T5 + vpshufb T6, T6, F + vmovdqu [rsp+(I*8+6)*32],T6 + vpshufb T7, T7, F + vmovdqu [rsp+(I*8+7)*32],T7 + add IDX, 32 +%assign I (I+1) +%endrep + + + ; save old digests + vmovdqu AA, A + vmovdqu BB, B + vmovdqu CC, C + vmovdqu DD, D + vmovdqu EE, E + +;; +;; perform 0-79 steps +;; + vmovdqu K, [K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqu W16, [rsp + ((16 - 16) & 15) * 32] + vmovdqu W15, [rsp + ((16 - 15) & 15) * 32] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqu K, [K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqu K, [K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + vmovdqu K, [K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub arg2, 1 + jne lloop + + ; write out digests + vmovdqu [arg1 + 0*32], A + vmovdqu [arg1 + 1*32], B + vmovdqu [arg1 + 2*32], C + vmovdqu [arg1 + 3*32], D + vmovdqu [arg1 + 4*32], E + + ;; update input pointers + add 
inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [arg1+_data_ptr+0*8], inp0 + mov [arg1+_data_ptr+1*8], inp1 + mov [arg1+_data_ptr+2*8], inp2 + mov [arg1+_data_ptr+3*8], inp3 + mov [arg1+_data_ptr+4*8], inp4 + mov [arg1+_data_ptr+5*8], inp5 + mov [arg1+_data_ptr+6*8], inp6 + mov [arg1+_data_ptr+7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + mov rsp, RSP_SAVE + + pop RSP_SAVE + ret + + + +section .data align=32 + +align 32 +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c new file mode 100644 index 000000000..e778c5d98 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c @@ -0,0 +1,112 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include "sha1_mb.h" +#include "test.h" + +// Test messages +#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" +uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +uint8_t msg3[] = TST_STR TST_STR "0123456789:;<"; +uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?"; +uint8_t msg6[] = + TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +uint8_t msg7[] = ""; + +// Expected digests +uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 }; +uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D }; +uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 }; +uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 }; +uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 }; +uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 }; +uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 }; + +uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; +uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 }; + +int check_job(uint32_t * ref, uint32_t * good, int words) +{ + int i; + for (i = 0; i < words; i++) + if (good[i] != ref[i]) + return 1; + + return 0; +} + +#define MAX_MSGS 7 + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[MAX_MSGS]; + SHA1_HASH_CTX *p_job; + int i, checked = 0, failed = 0; + int n = sizeof(msgs) / sizeof(msgs[0]); + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed, test aborted\n"); + return 1; + } + // Initialize multi-buffer manager + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < n; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)expected_digest[i]; + + p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i], + strlen((char *)msgs[i]), HASH_ENTIRE); + + if (p_job) { // If we have finished a job, process it + checked++; + failed += + check_job(p_job->job.result_digest, p_job->user_data, + SHA1_DIGEST_NWORDS); + } + } + + // Finish remaining jobs + while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) { + checked++; + failed += + check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS); + } + + printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed); + return failed; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm new file mode 100644 index 000000000..c205f2389 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm @@ -0,0 +1,131 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 +%define WRT_OPT wrt ..plt +%else +%define WRT_OPT +%endif + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern sha1_ctx_mgr_init_sse +extern sha1_ctx_mgr_submit_sse +extern sha1_ctx_mgr_flush_sse + +extern sha1_ctx_mgr_init_avx +extern sha1_ctx_mgr_submit_avx +extern sha1_ctx_mgr_flush_avx + +extern sha1_ctx_mgr_init_avx2 +extern sha1_ctx_mgr_submit_avx2 +extern sha1_ctx_mgr_flush_avx2 + +extern sha1_ctx_mgr_init_base +extern sha1_ctx_mgr_submit_base +extern sha1_ctx_mgr_flush_base + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sha1_ctx_mgr_init_avx512 + extern sha1_ctx_mgr_submit_avx512 + extern sha1_ctx_mgr_flush_avx512 +%endif + +%ifdef HAVE_AS_KNOWS_SHANI + extern sha1_ctx_mgr_init_sse_ni + extern sha1_ctx_mgr_submit_sse_ni + extern sha1_ctx_mgr_flush_sse_ni +%endif + +%ifdef HAVE_AS_KNOWS_AVX512 + %ifdef HAVE_AS_KNOWS_SHANI + extern sha1_ctx_mgr_init_avx512_ni + extern sha1_ctx_mgr_submit_avx512_ni + extern sha1_ctx_mgr_flush_avx512_ni + %endif +%endif + +;;; *_mbinit is the initial value of *_dispatched, which is updated on the first call. +;;; Therefore, *_dispatch_init is only executed on the first call.
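The comment above describes the multibinary pattern: each public symbol initially points at a small stub that probes the CPU once, rebinds the pointer to the best kernel, and forwards the call, so every later call dispatches directly. A minimal C sketch of that first-call rebinding, with illustrative stand-in names (init_base, init_avx2, cpu_has_avx2 are not isa-l symbols):

#include <stdio.h>

typedef void (*init_fn)(void *);

/* Illustrative stand-ins; the real targets are sha1_ctx_mgr_init_{base,sse,avx,avx2,...}. */
static void init_base(void *mgr) { (void)mgr; puts("base init"); }
static void init_avx2(void *mgr) { (void)mgr; puts("avx2 init"); }
static int  cpu_has_avx2(void)   { return 0; /* stand-in for the CPUID probe */ }

static void init_dispatch(void *mgr);      /* forward declaration */
static init_fn init_ptr = init_dispatch;   /* plays the role of *_dispatched, seeded with *_mbinit */

static void init_dispatch(void *mgr)
{
        /* First call only: pick the best implementation, rebind, then forward. */
        init_ptr = cpu_has_avx2() ? init_avx2 : init_base;
        init_ptr(mgr);
}

void sha1_mgr_init(void *mgr)              /* conceptually what mbin_interface exposes */
{
        init_ptr(mgr);                     /* later calls jump straight to the chosen kernel */
}

int main(void)
{
        sha1_mgr_init(NULL);               /* resolves on first use */
        sha1_mgr_init(NULL);               /* direct call thereafter */
        return 0;
}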
+ +; Initialise symbols +mbin_interface sha1_ctx_mgr_init +mbin_interface sha1_ctx_mgr_submit +mbin_interface sha1_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6's extension through replacing base by sse version + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \ + sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \ + sha1_ctx_mgr_init_avx512, sha1_ctx_mgr_init_sse_ni, sha1_ctx_mgr_init_avx512_ni + mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \ + sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \ + sha1_ctx_mgr_submit_avx512, sha1_ctx_mgr_submit_sse_ni, sha1_ctx_mgr_submit_avx512_ni + mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \ + sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \ + sha1_ctx_mgr_flush_avx512, sha1_ctx_mgr_flush_sse_ni, sha1_ctx_mgr_flush_avx512_ni + %else + mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \ + sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \ + sha1_ctx_mgr_init_avx512 + mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \ + sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \ + sha1_ctx_mgr_submit_avx512 + mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \ + sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \ + sha1_ctx_mgr_flush_avx512 + %endif +%else + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \ + sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_sse_ni + mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \ + sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_sse_ni + mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \ + sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_sse_ni + %else + mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \ + sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2 + mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \ + sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2 + mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \ + sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2 + %endif +%endif + +;;; func core, ver, snum +slversion sha1_ctx_mgr_init, 00, 04, 0148 +slversion sha1_ctx_mgr_submit, 00, 04, 0149 +slversion sha1_ctx_mgr_flush, 00, 04, 0150 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm new file mode 100644 index 000000000..86d09e303 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm @@ -0,0 +1,318 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define FRAMESZ 32 ; space for ABCDE +%define RSPSAVE rax + +%define ABCD xmm0 +; two E's because they ping-pong +%define E0 xmm1 +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 +%define SHUF_MASK xmm7 + +; arg indexing starts from 0, while mgr_flush/submit indexing starts from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define TMP r9 ; local variable -- assistant to address digest +;%define TMP2 r8 ; local variable -- assistant to address digest +align 32 + +; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisible arg 2 : IDX : lane on which to hash +; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r9~r11, xmm0-xmm7 +; +mk_global sha1_ni_x1, function, internal +sha1_ni_x1: + endbranch + mov RSPSAVE, rsp + sub rsp, FRAMESZ + and rsp, ~0xF ; align rsp down to 16 bytes + + shl NBLK, 6 ; convert block count into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*IDX] + ;; Initialize digest + pinsrd ABCD, [TMP + 0*NLANX4], 3 + pinsrd ABCD, [TMP + 1*NLANX4], 2 + pinsrd ABCD, [TMP + 2*NLANX4], 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd ABCD, [TMP + 1*NLANX4], 0 + pinsrd E0, [TMP + 2*NLANX4], 3 + pand E0, [IDX3_WORD_MASK] + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + IDX*8] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + ; Save hash values for addition after rounds + movdqa [rsp + 0*16], E0 + movdqa [rsp + 1*16],
ABCD + + ; do rounds 0-3 + movdqu MSG0, [DPTR + 0*16] + pshufb MSG0, SHUF_MASK + paddd E0, MSG0 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + + ; do rounds 4-7 + movdqu MSG1, [DPTR + 1*16] + pshufb MSG1, SHUF_MASK + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + + ; do rounds 8-11 + movdqu MSG2, [DPTR + 2*16] + pshufb MSG2, SHUF_MASK + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 12-15 + movdqu MSG3, [DPTR + 3*16] + pshufb MSG3, SHUF_MASK + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + + ; do rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + + ; do rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + + ; Add current hash values with previously saved + sha1nexte E0, [rsp + 0*16] + paddd ABCD, [rsp + 1*16] + + ; Increment data pointer and loop if more to process + add DPTR, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*IDX] + pextrd [TMP + 0*NLANX4], ABCD, 3 + pextrd [TMP + 1*NLANX4], ABCD, 2 + pextrd [TMP + 2*NLANX4], ABCD, 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], ABCD, 0 + pextrd [TMP + 2*NLANX4], E0, 3 + + ; update input pointers + mov [MGR + _data_ptr + IDX*8], DPTR + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + + mov rsp, RSPSAVE + + ret + + +section .data 
align=16 +PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_ni_x1 +no_sha1_ni_x1: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm new file mode 100644 index 000000000..7b0ddb74e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm @@ -0,0 +1,484 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
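For readers following the sha1_ni_x1/sha1_ni_x2 round sequences, the fragment below is a C-intrinsics sketch of the same "rounds 0-7" pattern, assuming a compiler with SHA extension support (e.g. gcc/clang with -msse4.1 -msha): sha1rnds4 runs four rounds on ABCD, sha1nexte derives the next E from rol(A,30) of the saved state plus the next message words, and sha1msg1 begins extending the message schedule. The function name and pointer-based interface are illustrative only, and it is not a full SHA-1.

#include <immintrin.h>
#include <stdint.h>

/* Sketch of rounds 0-7 only; the full 80-round ping-pong continues as in the .asm above. */
void sha1_ni_rounds_0_7_sketch(__m128i *abcd, __m128i *e0, const uint8_t block[64])
{
        /* Same byte-swap mask as PSHUFFLE_SHANI_MASK (low quadword listed first in the dq). */
        const __m128i shuf = _mm_set_epi64x(0x0001020304050607LL, 0x08090a0b0c0d0e0fLL);
        __m128i msg0, msg1, e1;

        /* rounds 0-3: E0 += M0, save ABCD into E1, four rounds with function 0 */
        msg0  = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 0)), shuf);
        *e0   = _mm_add_epi32(*e0, msg0);
        e1    = *abcd;
        *abcd = _mm_sha1rnds4_epu32(*abcd, *e0, 0);

        /* rounds 4-7: sha1nexte folds rol(A,30) of the saved state into the next E,
         * sha1msg1 starts extending the message schedule */
        msg1  = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 16)), shuf);
        e1    = _mm_sha1nexte_epu32(e1, msg1);
        *e0   = *abcd;
        *abcd = _mm_sha1rnds4_epu32(*abcd, e1, 0);
        msg0  = _mm_sha1msg1_epu32(msg0, msg1);
        (void)msg0;
}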
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define FRAMESZ 64 ; space for ABCDE +%define RSPSAVE rax + +%define ABCD xmm0 +; two E's b/c for ping-pong +%define E0 xmm1 +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 + +%define ABCDb xmm7 +%define E0b xmm8 ; Need two E's b/c they ping pong +%define E1b xmm9 +%define MSG0b xmm10 +%define MSG1b xmm11 +%define MSG2b xmm12 +%define MSG3b xmm13 + +%define SHUF_MASK xmm14 + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 + +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define DPTRb r12 ; +%define TMP r9 ; local variable -- assistant to address digest +%define TMPb r13 ; local variable -- assistant to address digest +align 32 + +; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r9~r13, xmm0-xmm14 +; +mk_global sha1_ni_x2, function, internal +sha1_ni_x2: + endbranch + mov RSPSAVE, rsp + sub rsp, FRAMESZ + and rsp, ~0xF ; Align 16Bytes downward + + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + _args_digest ]; + lea TMPb,[MGR + _args_digest + 4*1]; + + ;; Initialize digest + pinsrd ABCD, [TMP + 0*NLANX4], 3 + pinsrd ABCD, [TMP + 1*NLANX4], 2 + pinsrd ABCD, [TMP + 2*NLANX4], 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd ABCD, [TMP + 1*NLANX4], 0 + pinsrd E0, [TMP + 2*NLANX4], 3 + pand E0, [IDX3_WORD_MASK] + + pinsrd ABCDb, [TMPb + 0*NLANX4], 3 + pinsrd ABCDb, [TMPb + 1*NLANX4], 2 + pinsrd ABCDb, [TMPb + 2*NLANX4], 1 + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd ABCDb, [TMPb + 1*NLANX4], 0 + pinsrd E0b, [TMPb + 2*NLANX4], 3 + pand E0b, [IDX3_WORD_MASK] + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr ] + mov DPTRb,[MGR + _data_ptr + 8*1] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + movdqa [rsp + 0*16], E0 + movdqa [rsp + 1*16], ABCD + + movdqa [rsp + 2*16], E0b + movdqa [rsp + 3*16], ABCDb + + ; do rounds 0-3 + movdqu MSG0, [DPTR + 0*16] + pshufb MSG0, SHUF_MASK + paddd E0, MSG0 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + + movdqu MSG0b, [DPTRb + 0*16] + pshufb MSG0b, SHUF_MASK + paddd E0b, MSG0b + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + + ; do rounds 4-7 + movdqu MSG1, [DPTR + 1*16] + pshufb MSG1, SHUF_MASK + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + + movdqu MSG1b, [DPTRb + 1*16] + pshufb MSG1b, SHUF_MASK + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG0b, MSG1b + + ; do rounds 8-11 + movdqu MSG2, [DPTR + 2*16] + pshufb MSG2, SHUF_MASK + 
sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + movdqu MSG2b, [DPTRb + 2*16] + pshufb MSG2b, SHUF_MASK + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ; do rounds 12-15 + movdqu MSG3, [DPTR + 3*16] + pshufb MSG3, SHUF_MASK + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + movdqu MSG3b, [DPTRb + 3*16] + pshufb MSG3b, SHUF_MASK + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ; do rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ; do rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ; do rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ; do rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ; do rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ; do rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ; do rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ; do rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ; do rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ; do rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ; do rounds 56-59 + 
sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ; do rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 3 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ; do rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 3 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ; do rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 3 + pxor MSG3b, MSG1b + + ; do rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 3 + + ; do rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 3 + + ; Add current hash values with previously saved + sha1nexte E0, [rsp + 0*16] + paddd ABCD, [rsp + 1*16] + + sha1nexte E0b, [rsp + 2*16] + paddd ABCDb, [rsp + 3*16] + + ; Increment data pointer and loop if more to process + add DPTR, 64 + add DPTRb, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + _args_digest] + pextrd [TMP + 0*NLANX4], ABCD, 3 + pextrd [TMP + 1*NLANX4], ABCD, 2 + pextrd [TMP + 2*NLANX4], ABCD, 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], ABCD, 0 + pextrd [TMP + 2*NLANX4], E0, 3 + + lea TMPb, [MGR +_args_digest + 4*1] + pextrd [TMPb + 0*NLANX4], ABCDb, 3 + pextrd [TMPb + 1*NLANX4], ABCDb, 2 + pextrd [TMPb + 2*NLANX4], ABCDb, 1 + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMPb + 1*NLANX4], ABCDb, 0 + pextrd [TMPb + 2*NLANX4], E0b, 3 + + ; update input pointers + mov [MGR + _data_ptr], DPTR + mov [MGR + _data_ptr + 8*1], DPTRb + +backto_mgr: +;;;;;;;;;;;;;;;; +;; Postamble + + mov rsp, RSPSAVE + + ret + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_ni_x2 +no_sha1_ni_x2: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm new file mode 100644 index 000000000..aeb00a008 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm @@ -0,0 +1,485 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; FRAMESZ plus pushes must be an odd multiple of 8 +_GPR_SAVE_SIZE equ 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15 +_WK_SAVE_SIZE equ 16*4 + +_WK_SAVE equ 0 +_GPR_SAVE equ _WK_SAVE + _WK_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +; rax~rdx, rsi, rdi, rbp are used for RR +%define N_MGR r8 +%define IDX r9 ; local variable -- consistent with caller +%define K_BASE r11 +%define BUFFER_PTR r12 +%define BUFFER_END r13 +%define TMP r14 ; local variable -- assistant to address digest + +%xdefine W_TMP xmm0 +%xdefine W_TMP2 xmm9 + +%xdefine W0 xmm1 +%xdefine W4 xmm2 +%xdefine W8 xmm3 +%xdefine W12 xmm4 +%xdefine W16 xmm5 +%xdefine W20 xmm6 +%xdefine W24 xmm7 +%xdefine W28 xmm8 + +%xdefine XMM_SHUFB_BSWAP xmm10 + +;; we keep window of 64 w[i]+K pre-calculated values in a circular buffer +%xdefine WK(t) (rsp + (t & 15)*4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Constants + +%xdefine K1 0x5a827999 +%xdefine K2 0x6ed9eba1 +%xdefine K3 0x8f1bbcdc +%xdefine K4 0xca62c1d6 + +%xdefine W_PRECALC_AHEAD 16 +%xdefine W_NO_TAIL_PRECALC 0 + +; Rounds macros + +%macro REGALLOC 0 + %xdefine A ecx + %xdefine B esi + %xdefine C edi + %xdefine D ebp + %xdefine E edx + + %xdefine T1 eax + %xdefine T2 ebx +%endmacro + +%macro F1 3 + mov T1,%2 + xor T1,%3 + and T1,%1 + xor T1,%3 +%endmacro + +%macro F2 3 + mov T1,%3 + xor T1,%2 + xor T1,%1 +%endmacro + +%macro F3 3 + mov T1,%2 + mov T2,%1 + or T1,%1 + and T2,%2 + and T1,%3 + or T1,T2 +%endmacro + +%define F4 F2 + +%macro UPDATE_HASH 2 + add %2, %1 + mov %1, %2 +%endmacro + + +%macro W_PRECALC 1 + %xdefine i (%1) + + %if (i < 20) + %xdefine K_XMM 0 + 
%elif (i < 40) + %xdefine K_XMM 16 + %elif (i < 60) + %xdefine K_XMM 32 + %else + %xdefine K_XMM 48 + %endif + + %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD))) + + %if (W_NO_TAIL_PRECALC == 0) + + %xdefine i ((%1) % 80) ;; pre-compute for the next iteration + + %if (i == 0) + W_PRECALC_RESET + %endif + + + W_PRECALC_00_15 + %endif + + %elif (i < 32) + W_PRECALC_16_31 + %elif (i < 80) ;; rounds 32-79 + W_PRECALC_32_79 + %endif +%endmacro + +%macro W_PRECALC_RESET 0 + %xdefine W W0 + %xdefine W_minus_04 W4 + %xdefine W_minus_08 W8 + %xdefine W_minus_12 W12 + %xdefine W_minus_16 W16 + %xdefine W_minus_20 W20 + %xdefine W_minus_24 W24 + %xdefine W_minus_28 W28 + %xdefine W_minus_32 W +%endmacro + +%macro W_PRECALC_ROTATE 0 + %xdefine W_minus_32 W_minus_28 + %xdefine W_minus_28 W_minus_24 + %xdefine W_minus_24 W_minus_20 + %xdefine W_minus_20 W_minus_16 + %xdefine W_minus_16 W_minus_12 + %xdefine W_minus_12 W_minus_08 + %xdefine W_minus_08 W_minus_04 + %xdefine W_minus_04 W + %xdefine W W_minus_32 +%endmacro + +%macro W_PRECALC_00_15 0 + ;; message scheduling pre-compute for rounds 0-15 + %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds + movdqu W_TMP, [BUFFER_PTR + (i * 4)] + %elif ((i & 3) == 1) + pshufb W_TMP, XMM_SHUFB_BSWAP + movdqa W, W_TMP + %elif ((i & 3) == 2) + paddd W_TMP, [K_BASE] + %elif ((i & 3) == 3) + movdqa [WK(i&~3)], W_TMP + + W_PRECALC_ROTATE + %endif +%endmacro + +%macro W_PRECALC_16_31 0 + ;; message scheduling pre-compute for rounds 16-31 + ;; calculating last 32 w[i] values in 8 XMM registers + ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction + ;; + ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency + ;; + %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds + movdqa W, W_minus_12 + palignr W, W_minus_16, 8 ;; w[i-14] + movdqa W_TMP, W_minus_04 + psrldq W_TMP, 4 ;; w[i-3] + pxor W, W_minus_08 + %elif ((i & 3) == 1) + pxor W_TMP, W_minus_16 + pxor W, W_TMP + movdqa W_TMP2, W + movdqa W_TMP, W + pslldq W_TMP2, 12 + %elif ((i & 3) == 2) + psrld W, 31 + pslld W_TMP, 1 + por W_TMP, W + movdqa W, W_TMP2 + psrld W_TMP2, 30 + pslld W, 2 + %elif ((i & 3) == 3) + pxor W_TMP, W + pxor W_TMP, W_TMP2 + movdqa W, W_TMP + paddd W_TMP, [K_BASE + K_XMM] + movdqa [WK(i&~3)],W_TMP + + W_PRECALC_ROTATE + %endif +%endmacro + +%macro W_PRECALC_32_79 0 + ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + ;; instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + ;; + %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds + movdqa W_TMP, W_minus_04 + pxor W, W_minus_28 ;; W is W_minus_32 before xor + palignr W_TMP, W_minus_08, 8 + %elif ((i & 3) == 1) + pxor W, W_minus_16 + pxor W, W_TMP + movdqa W_TMP, W + %elif ((i & 3) == 2) + psrld W, 30 + pslld W_TMP, 2 + por W_TMP, W + %elif ((i & 3) == 3) + movdqa W, W_TMP + paddd W_TMP, [K_BASE + K_XMM] + movdqa [WK(i&~3)],W_TMP + + W_PRECALC_ROTATE + %endif +%endmacro + +%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation + + ;; TEMP = A + ;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i) + ;; C = ROTATE_LEFT( B, 30 ) + ;; D = C + ;; E = D + ;; B = TEMP + + W_PRECALC (%6 + W_PRECALC_AHEAD) + F %2, %3, %4 ;; F returns result in T1 + add %5, [WK(%6)] + rol %2, 30 + mov T2, %1 + add 
%4, [WK(%6 + 1)] + rol T2, 5 + add %5, T1 + + W_PRECALC (%6 + W_PRECALC_AHEAD + 1) + add T2, %5 + mov %5, T2 + rol T2, 5 + add %4, T2 + F %1, %2, %3 ;; F returns result in T1 + add %4, T1 + rol %1, 30 + +;; write: %1, %2 +;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: all general regs (except r15), xmm0-xmm10 +; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack} +; +mk_global sha1_opt_x1, function, internal +sha1_opt_x1: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], rbp +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rdi + mov [rsp + _GPR_SAVE + 8*3], rsi + ; caller has already stored XMM6~10 +%endif + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 + mov [rsp + _GPR_SAVE + 8*8], rdx + + + shl NBLK, 6 ; transform blk amount into bytes + jz .lend + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + ;; let sha1_opt sb takes over r8~r11 + ;; Load input pointers + mov N_MGR, MGR + mov BUFFER_PTR, [MGR + _data_ptr + IDX*8] + ;; nblk is used to indicate data end + add NBLK, BUFFER_PTR + mov BUFFER_END, NBLK + + lea K_BASE, [K_XMM_AR] + movdqu XMM_SHUFB_BSWAP, [bswap_shufb_ctl] + + REGALLOC + + lea TMP, [N_MGR + 4*IDX] + ;; Initialize digest + mov A, [TMP + 0*NLANX4] + mov B, [TMP + 1*NLANX4] + mov C, [TMP + 2*NLANX4] + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + mov D, [TMP + 1*NLANX4] + mov E, [TMP + 2*NLANX4] + + %assign i 0 + %rep W_PRECALC_AHEAD + W_PRECALC i + %assign i i+1 + %endrep + + %xdefine F F1 + +.lloop: + cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block, + jne .lbegin ;; it is set below by: cmovae BUFFER_PTR, K_BASE + jmp .lend + +.lbegin: + RR A,B,C,D,E,0 + RR D,E,A,B,C,2 + RR B,C,D,E,A,4 + RR E,A,B,C,D,6 + RR C,D,E,A,B,8 + + RR A,B,C,D,E,10 + RR D,E,A,B,C,12 + RR B,C,D,E,A,14 + RR E,A,B,C,D,16 + RR C,D,E,A,B,18 + + %xdefine F F2 + + RR A,B,C,D,E,20 + RR D,E,A,B,C,22 + RR B,C,D,E,A,24 + RR E,A,B,C,D,26 + RR C,D,E,A,B,28 + + RR A,B,C,D,E,30 + RR D,E,A,B,C,32 + RR B,C,D,E,A,34 + RR E,A,B,C,D,36 + RR C,D,E,A,B,38 + + %xdefine F F3 + + RR A,B,C,D,E,40 + RR D,E,A,B,C,42 + RR B,C,D,E,A,44 + RR E,A,B,C,D,46 + RR C,D,E,A,B,48 + + RR A,B,C,D,E,50 + RR D,E,A,B,C,52 + RR B,C,D,E,A,54 + RR E,A,B,C,D,56 + RR C,D,E,A,B,58 + + %xdefine F F4 + + add BUFFER_PTR, 64 ;; move to next 64-byte block + cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one + cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration + + RR A,B,C,D,E,60 + RR D,E,A,B,C,62 + RR B,C,D,E,A,64 + RR E,A,B,C,D,66 + RR C,D,E,A,B,68 + + RR A,B,C,D,E,70 + RR D,E,A,B,C,72 + RR B,C,D,E,A,74 + RR E,A,B,C,D,76 + RR C,D,E,A,B,78 + + lea TMP, [N_MGR + 4*IDX] + UPDATE_HASH [TMP + 0*NLANX4],A + UPDATE_HASH [TMP + 1*NLANX4],B + UPDATE_HASH [TMP + 2*NLANX4],C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + UPDATE_HASH [TMP + 1*NLANX4],D + UPDATE_HASH [TMP + 2*NLANX4],E + + jmp .lloop + + .lend: + mov MGR, N_MGR + + mov 
rdx, [rsp + _GPR_SAVE + 8*8] + mov r15, [rsp + _GPR_SAVE + 8*7] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r12, [rsp + _GPR_SAVE + 8*4] +%ifidn __OUTPUT_FORMAT__, win64 + mov rsi, [rsp + _GPR_SAVE + 8*3] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbp, [rsp + _GPR_SAVE + 8*1] + mov rbx, [rsp + _GPR_SAVE + 8*0] + add rsp, STACK_SPACE + + ret + + +;;---------------------- +section .data align=64 + +align 128 +K_XMM_AR: + DD K1, K1, K1, K1 + DD K2, K2, K2, K2 + DD K3, K3, K3, K3 + DD K4, K4, K4, K4 + +align 16 +bswap_shufb_ctl: + DD 00010203h + DD 04050607h + DD 08090a0bh + DD 0c0d0e0fh diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c new file mode 100644 index 000000000..e82fb30fe --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c @@ -0,0 +1,220 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include "sha1_mb.h" +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA1 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define H0 0x67452301 +#define H1 0xefcdab89 +#define H2 0x98badcfe +#define H3 0x10325476 +#define H4 0xc3d2e1f0 + +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (b ^ c ^ d) +#define F3(b,c,d) ((b & c) | (d & (b | c))) +#define F4(b,c,d) (b ^ c ^ d) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + +#define W(x) w[(x) & 15] + +#define step00_19(i,a,b,c,d,e) \ + if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + else W(i) = to_be32(ww[i]); \ + e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \ + b = rol32(b,30) + +#define step20_39(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \ + b = rol32(b,30) + +#define step40_59(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \ + b = rol32(b,30) + +#define step60_79(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \ + b = rol32(b,30) + +static void OPT_FIX sha1_single(const uint8_t * data, uint32_t digest[]); + +void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA1_BLOCK_SIZE]; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + + i = len; + while (i >= SHA1_BLOCK_SIZE) { + sha1_single(input_data, digest); + input_data += SHA1_BLOCK_SIZE; + i -= SHA1_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE) + i = 2 * SHA1_BLOCK_SIZE; + else + i = SHA1_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha1_single(buf, digest); + if (i == (2 * SHA1_BLOCK_SIZE)) + sha1_single(buf + SHA1_BLOCK_SIZE, digest); +} + +void sha1_single(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e; + uint32_t w[16] = { 0 }; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + + step00_19(0, a, b, c, d, e); + step00_19(1, e, a, b, c, d); + step00_19(2, d, e, a, b, c); + step00_19(3, c, d, e, a, b); + step00_19(4, b, c, d, e, a); + step00_19(5, a, b, c, d, e); + step00_19(6, e, a, b, c, d); + step00_19(7, d, e, a, b, c); + step00_19(8, c, d, e, a, b); + step00_19(9, b, c, d, e, a); + step00_19(10, a, b, c, d, e); + step00_19(11, e, a, b, c, d); + step00_19(12, d, e, a, b, c); + step00_19(13, c, d, e, a, b); + step00_19(14, b, c, d, e, a); + step00_19(15, a, b, c, d, e); + step00_19(16, e, a, b, c, d); + step00_19(17, d, e, a, b, c); + step00_19(18, c, d, e, a, b); + step00_19(19, b, c, d, e, a); + + step20_39(20, a, b, c, d, e); + step20_39(21, e, a, b, c, d); + step20_39(22, d, e, a, b, c); + step20_39(23, c, d, e, a, b); + step20_39(24, b, c, d, e, a); + step20_39(25, a, b, c, d, e); + step20_39(26, e, a, 
b, c, d); + step20_39(27, d, e, a, b, c); + step20_39(28, c, d, e, a, b); + step20_39(29, b, c, d, e, a); + step20_39(30, a, b, c, d, e); + step20_39(31, e, a, b, c, d); + step20_39(32, d, e, a, b, c); + step20_39(33, c, d, e, a, b); + step20_39(34, b, c, d, e, a); + step20_39(35, a, b, c, d, e); + step20_39(36, e, a, b, c, d); + step20_39(37, d, e, a, b, c); + step20_39(38, c, d, e, a, b); + step20_39(39, b, c, d, e, a); + + step40_59(40, a, b, c, d, e); + step40_59(41, e, a, b, c, d); + step40_59(42, d, e, a, b, c); + step40_59(43, c, d, e, a, b); + step40_59(44, b, c, d, e, a); + step40_59(45, a, b, c, d, e); + step40_59(46, e, a, b, c, d); + step40_59(47, d, e, a, b, c); + step40_59(48, c, d, e, a, b); + step40_59(49, b, c, d, e, a); + step40_59(50, a, b, c, d, e); + step40_59(51, e, a, b, c, d); + step40_59(52, d, e, a, b, c); + step40_59(53, c, d, e, a, b); + step40_59(54, b, c, d, e, a); + step40_59(55, a, b, c, d, e); + step40_59(56, e, a, b, c, d); + step40_59(57, d, e, a, b, c); + step40_59(58, c, d, e, a, b); + step40_59(59, b, c, d, e, a); + + step60_79(60, a, b, c, d, e); + step60_79(61, e, a, b, c, d); + step60_79(62, d, e, a, b, c); + step60_79(63, c, d, e, a, b); + step60_79(64, b, c, d, e, a); + step60_79(65, a, b, c, d, e); + step60_79(66, e, a, b, c, d); + step60_79(67, d, e, a, b, c); + step60_79(68, c, d, e, a, b); + step60_79(69, b, c, d, e, a); + step60_79(70, a, b, c, d, e); + step60_79(71, e, a, b, c, d); + step60_79(72, d, e, a, b, c); + step60_79(73, c, d, e, a, b); + step60_79(74, b, c, d, e, a); + step60_79(75, a, b, c, d, e); + step60_79(76, e, a, b, c, d); + step60_79(77, d, e, a, b, c); + step60_79(78, c, d, e, a, b); + step60_79(79, b, c, d, e, a); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am new file mode 100644 index 000000000..9405c2469 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am @@ -0,0 +1,127 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_x86_64 += sha256_mb/sha256_ctx_sse.c \ + sha256_mb/sha256_ctx_avx.c \ + sha256_mb/sha256_ctx_avx2.c \ + sha256_mb/sha256_ctx_base.c + +lsrc_x86_64 += sha256_mb/sha256_mb_mgr_init_sse.c \ + sha256_mb/sha256_mb_mgr_init_avx2.c + + +lsrc_x86_64 += sha256_mb/sha256_mb_mgr_submit_sse.asm \ + sha256_mb/sha256_mb_mgr_submit_avx.asm \ + sha256_mb/sha256_mb_mgr_submit_avx2.asm \ + sha256_mb/sha256_mb_mgr_flush_sse.asm \ + sha256_mb/sha256_mb_mgr_flush_avx.asm \ + sha256_mb/sha256_mb_mgr_flush_avx2.asm \ + sha256_mb/sha256_mb_x4_sse.asm \ + sha256_mb/sha256_mb_x4_avx.asm \ + sha256_mb/sha256_mb_x8_avx2.asm \ + sha256_mb/sha256_multibinary.asm + +lsrc_x86_64 += sha256_mb/sha256_ctx_avx512.c \ + sha256_mb/sha256_mb_mgr_init_avx512.c \ + sha256_mb/sha256_mb_mgr_submit_avx512.asm \ + sha256_mb/sha256_mb_mgr_flush_avx512.asm \ + sha256_mb/sha256_mb_x16_avx512.asm + +lsrc_x86_64 += sha256_mb/sha256_opt_x1.asm + +lsrc_x86_64 += sha256_mb/sha256_ni_x1.asm \ + sha256_mb/sha256_ni_x2.asm \ + sha256_mb/sha256_ctx_sse_ni.c \ + sha256_mb/sha256_ctx_avx512_ni.c \ + sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \ + sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \ + sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \ + sha256_mb/sha256_ref.c + +lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \ + sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c \ + sha256_mb/aarch64/sha256_ctx_ce.c \ + sha256_mb/aarch64/sha256_mb_mgr_ce.c \ + sha256_mb/aarch64/sha256_mb_x1_ce.S \ + sha256_mb/aarch64/sha256_mb_x2_ce.S \ + sha256_mb/aarch64/sha256_mb_x3_ce.S \ + sha256_mb/aarch64/sha256_mb_x4_ce.S + + +lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c \ + sha256_mb/sha256_ctx_base.c \ + sha256_mb/sha256_ref.c + +src_include += -I $(srcdir)/sha256_mb + +extern_hdrs += include/sha256_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + include/multibinary.asm \ + sha256_mb/sha256_job.asm \ + sha256_mb/sha256_mb_mgr_datastruct.asm \ + include/reg_sizes.asm \ + sha256_mb/sha256_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha256_mb/sha256_mb_test \ + sha256_mb/sha256_mb_rand_test \ + sha256_mb/sha256_mb_rand_update_test \ + sha256_mb/sha256_mb_flush_test + +unit_tests += sha256_mb/sha256_mb_rand_ssl_test + +perf_tests += sha256_mb/sha256_mb_vs_ossl_perf \ + sha256_mb/sha256_mb_vs_ossl_shortage_perf + +sha256_mb_rand_ssl_test: sha256_ref.o +sha256_mb_rand_test: sha256_ref.o +sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_rand_update_test: sha256_ref.o +sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_flush_test: sha256_ref.o +sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_rand_ssl_test: LDLIBS += -lcrypto 
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c new file mode 100644 index 000000000..4776f55bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c @@ -0,0 +1,256 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state); +SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job); +SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_ce(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
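Taken together, the submit, flush, and resubmit paths above form the usual multi-buffer calling pattern: submit one context per message, then flush until the manager runs dry, at which point every context's digest is valid. The following is only a minimal caller sketch, not part of this patch; it assumes the hash_ctx_init() helper and HASH_ENTIRE flag from multi_buffer.h as used elsewhere in isa-l_crypto, and the function and buffer names are illustrative.

#include <stdlib.h>
#include <string.h>
#include "sha256_mb.h"

/* Hash two independent buffers in parallel on the CE lanes (sketch). */
static void sha256_two_buffers(const void *buf0, uint32_t len0,
                               const void *buf1, uint32_t len1,
                               uint32_t out0[SHA256_DIGEST_NWORDS],
                               uint32_t out1[SHA256_DIGEST_NWORDS])
{
	SHA256_HASH_CTX_MGR *mgr = NULL;
	SHA256_HASH_CTX ctx[2];

	/* The manager wants 16-byte alignment, as in the isa-l test code. */
	posix_memalign((void **)&mgr, 16, sizeof(*mgr));
	sha256_ctx_mgr_init_ce(mgr);

	hash_ctx_init(&ctx[0]);	/* assumed helper from multi_buffer.h */
	hash_ctx_init(&ctx[1]);

	/* One-shot jobs: HASH_ENTIRE combines HASH_FIRST and HASH_LAST. */
	sha256_ctx_mgr_submit_ce(mgr, &ctx[0], buf0, len0, HASH_ENTIRE);
	sha256_ctx_mgr_submit_ce(mgr, &ctx[1], buf1, len1, HASH_ENTIRE);

	/* Drain: flush returns NULL once no jobs remain in flight. */
	while (sha256_ctx_mgr_flush_ce(mgr) != NULL)
		;

	/* Every submitted context is complete now; copy out the digests. */
	memcpy(out0, ctx[0].job.result_digest, SHA256_DIGEST_NWORDS * sizeof(uint32_t));
	memcpy(out1, ctx[1].job.result_digest, SHA256_DIGEST_NWORDS * sizeof(uint32_t));

	free(mgr);
}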
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_ce_slver_02020142; +struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_submit_ce_slver_02020143; +struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_flush_ce_slver_02020144; +struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..8627991c3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c @@ -0,0 +1,59 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_submit_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_init_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_flush_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c new file mode 100644 index 000000000..aa63c4dd8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +#define SHA256_MB_CE_MAX_LANES 3 + +#if SHA256_MB_CE_MAX_LANES >=4 +void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +#endif +#if SHA256_MB_CE_MAX_LANES >=3 +void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +#endif +#if SHA256_MB_CE_MAX_LANES >=2 +void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int); +#endif +void sha256_mb_ce_x1(SHA256_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state) +{ + int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = SHA256_MB_CE_MAX_LANES - 1; i >= 0; i--) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SHA1_MB_CE_MAX_LANES is invalid lane + for (i = SHA256_MB_CE_MAX_LANES; i < SHA256_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SHA256_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SHA256_MB_CE_MAX_LANES == 4 + if (state->num_lanes_inuse == 4) { + len = min(min(state->lens[0], state->lens[1]), + min(state->lens[2], state->lens[3])); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x4(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, + state->ldata[3].job_in_lane, len >> 4); + + } else +#elif SHA256_MB_CE_MAX_LANES == 3 + if (state->num_lanes_inuse == 3) { + len = min(min(state->lens[0], state->lens[1]), state->lens[2]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x3(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, len >> 4); + + } else +#elif SHA256_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len 
& (~0xf); +#if SHA256_MB_CE_MAX_LANES >=4 + if (lanes == 4) { + sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, + state->ldata[lane_idx_array[3]].job_in_lane, len >> 4); + + } else +#endif +#if SHA256_MB_CE_MAX_LANES >=3 + if (lanes == 3) { + sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, len >> 4); + } else +#endif +#if SHA256_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < SHA256_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + +} + +static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state) +{ + int i; + SHA256_JOB *ret = NULL; + + for (i = 0; i < SHA256_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SHA256_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA256_JOB *ret; + + //add job into lanes + sha256_mb_mgr_insert_job(state, job); + + ret = sha256_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha256_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha256_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sha256_mb_mgr_free_lane(state); + return ret; +} + +SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state) +{ + SHA256_JOB *ret; + ret = sha256_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha256_mb_mgr_do_jobs(state); + return sha256_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S new file mode 100644 index 000000000..ecc5fc5f5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include + + +mbin_interface sha256_ctx_mgr_submit +mbin_interface sha256_ctx_mgr_init +mbin_interface sha256_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S new file mode 100644 index 000000000..06d0ab5fa --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S @@ -0,0 +1,238 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 48-63 +*/ +.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req + ldr key_q , [tmp] + mov l0_tmp2_v.16b,l0_abcd_v.16b + add tmp,tmp,16 + add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s + sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s + +.endm +/** +maros for round 0-47 +*/ +.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req + sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s + sha256_4_rounds_high \msg1,\tmp0,\tmp1 + sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,31 + + +/* +digest variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_efgh,1 + declare_var_vector_reg l0_abcd_saved,5 + declare_var_vector_reg l0_efgh_saved,6 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,2 + declare_var_vector_reg l0_tmp1,3 + declare_var_vector_reg l0_tmp2,4 +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + + + +/* + void sha256_mb_ce_x1(SHA1_JOB * l0_job, int len); +*/ +/* +Arguements list +*/ + l0_job .req x0 + len .req w1 + l0_data .req x2 + tmp .req x3 + .global sha256_mb_ce_x1 + .type sha256_mb_ce_x1, %function +sha256_mb_ce_x1: + ldr l0_data, [l0_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_efgh_q, [l0_job, 80] + + + +start_loop: + adr tmp, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + ldr key_q,[tmp] + add tmp,tmp,16 + //adjust loop parameter + add l0_data,l0_data,64 + sub len, len, #1 + cmp len, 0 + //backup digest + mov l0_abcd_saved_v.16b,l0_abcd_v.16b + mov l0_efgh_saved_v.16b,l0_efgh_v.16b + + rev32 l0_msg0_v.16b,l0_msg0_v.16b + rev32 l0_msg1_v.16b,l0_msg1_v.16b + add l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s + rev32 l0_msg2_v.16b,l0_msg2_v.16b + rev32 l0_msg3_v.16b,l0_msg3_v.16b + + + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */ + sha256_4_rounds_high msg2,tmp1,tmp0 + sha256_4_rounds_high msg3,tmp0,tmp1 + + /* rounds 60-63 */ + mov l0_tmp2_v.16b,l0_abcd_v.16b + sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s + + + + add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s + add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s + + + bgt start_loop + str l0_abcd_q, [l0_job, 64] + str l0_efgh_q, [l0_job, 80] + + ret + + .size sha256_mb_ce_x1, .-sha256_mb_ce_x1 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x428A2F98 + .word 0x71374491 + .word 0xB5C0FBCF + .word 
0xE9B5DBA5 + .word 0x3956C25B + .word 0x59F111F1 + .word 0x923F82A4 + .word 0xAB1C5ED5 + .word 0xD807AA98 + .word 0x12835B01 + .word 0x243185BE + .word 0x550C7DC3 + .word 0x72BE5D74 + .word 0x80DEB1FE + .word 0x9BDC06A7 + .word 0xC19BF174 + .word 0xE49B69C1 + .word 0xEFBE4786 + .word 0x0FC19DC6 + .word 0x240CA1CC + .word 0x2DE92C6F + .word 0x4A7484AA + .word 0x5CB0A9DC + .word 0x76F988DA + .word 0x983E5152 + .word 0xA831C66D + .word 0xB00327C8 + .word 0xBF597FC7 + .word 0xC6E00BF3 + .word 0xD5A79147 + .word 0x06CA6351 + .word 0x14292967 + .word 0x27B70A85 + .word 0x2E1B2138 + .word 0x4D2C6DFC + .word 0x53380D13 + .word 0x650A7354 + .word 0x766A0ABB + .word 0x81C2C92E + .word 0x92722C85 + .word 0xA2BFE8A1 + .word 0xA81A664B + .word 0xC24B8B70 + .word 0xC76C51A3 + .word 0xD192E819 + .word 0xD6990624 + .word 0xF40E3585 + .word 0x106AA070 + .word 0x19A4C116 + .word 0x1E376C08 + .word 0x2748774C + .word 0x34B0BCB5 + .word 0x391C0CB3 + .word 0x4ED8AA4A + .word 0x5B9CCA4F + .word 0x682E6FF3 + .word 0x748F82EE + .word 0x78A5636F + .word 0x84C87814 + .word 0x8CC70208 + .word 0x90BEFFFA + .word 0xA4506CEB + .word 0xBEF9A3F7 + .word 0xC67178F2 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S new file mode 100644 index 000000000..dadf44bb0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S @@ -0,0 +1,289 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 48-63 +*/ +.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req + ldr key_q , [tmp] + mov l0_tmp2_v.16b,l0_abcd_v.16b + mov l1_tmp2_v.16b,l1_abcd_v.16b + add tmp,tmp,16 + add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s + add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s + sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s + sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s + sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s + +.endm +/** +maros for round 0-47 +*/ +.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req + sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s + sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s + sha256_4_rounds_high \msg1,\tmp0,\tmp1 + sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s + sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,31 + + +/* +digest variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_efgh,1 + declare_var_vector_reg l0_abcd_saved,2 + declare_var_vector_reg l0_efgh_saved,3 + declare_var_vector_reg l1_abcd,4 + declare_var_vector_reg l1_efgh,5 + declare_var_vector_reg l1_abcd_saved,6 + declare_var_vector_reg l1_efgh_saved,7 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,8 + declare_var_vector_reg l0_tmp1,9 + declare_var_vector_reg l0_tmp2,10 + declare_var_vector_reg l1_tmp0,11 + declare_var_vector_reg l1_tmp1,12 + declare_var_vector_reg l1_tmp2,13 +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + declare_var_vector_reg l1_msg0,20 + declare_var_vector_reg l1_msg1,21 + declare_var_vector_reg l1_msg2,22 + declare_var_vector_reg l1_msg3,23 + + + +/* + void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int); +*/ +/* +Arguements list +*/ + l0_job .req x0 + l1_job .req x1 + len .req w2 + l0_data .req x3 + l1_data .req x4 + tmp .req x5 + .global sha256_mb_ce_x2 + .type sha256_mb_ce_x2, %function +sha256_mb_ce_x2: + //push d8~d15 + stp d8,d9,[sp,-192]! 
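The sha256_4_rounds_low/sha256_4_rounds_high macros above are the Crypto Extension form of four SHA-256 rounds plus one message-schedule step, software-pipelined so that the w+K add for the next group of rounds overlaps the sha256h/sha256h2 pair of the current group. A rough in-order rendering of the same pattern with the ACLE intrinsics from arm_neon.h is sketched below for clarity only; it assumes a compiler targeting armv8-a+crypto, and the names are illustrative rather than taken from this patch.

#include <arm_neon.h>

/* Four SHA-256 rounds on one lane, as done by the sha256h/sha256h2 pair,
 * plus the sha256su0/sha256su1 update that produces the message words used
 * sixteen rounds later (the hand-written kernels drop that update for the
 * final sixteen rounds).  k points at four constants from the KEY table. */
static inline void sha256_ce_quad_round(uint32x4_t *abcd, uint32x4_t *efgh,
                                        uint32x4_t *msg0, uint32x4_t msg1,
                                        uint32x4_t msg2, uint32x4_t msg3,
                                        const uint32_t k[4])
{
	uint32x4_t wk = vaddq_u32(*msg0, vld1q_u32(k));   /* w[i..i+3] + K      */
	uint32x4_t abcd_prev = *abcd;                     /* sha256h2 needs it  */

	*abcd = vsha256hq_u32(*abcd, *efgh, wk);          /* sha256h            */
	*efgh = vsha256h2q_u32(*efgh, abcd_prev, wk);     /* sha256h2           */

	/* sha256su0 + sha256su1: schedule w[i+16..i+19] for later rounds. */
	*msg0 = vsha256su1q_u32(vsha256su0q_u32(*msg0, msg1), msg2, msg3);
}

In the x2 and x3 kernels this same sequence is simply issued back to back for each lane, which helps hide the latency of the SHA instructions behind independent work.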
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + ldr l0_data, [l0_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_efgh_q, [l0_job, 80] + ldr l1_data, [l1_job] + ldr l1_abcd_q, [l1_job, 64] + ldr l1_efgh_q, [l1_job, 80] + + + +start_loop: + + //load key addr + adr tmp, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data] + ldr key_q,[tmp] + add tmp,tmp,16 + //adjust loop parameter + add l0_data,l0_data,64 + add l1_data,l1_data,64 + sub len, len, #1 + cmp len, 0 + //backup digest + mov l0_abcd_saved_v.16b,l0_abcd_v.16b + mov l0_efgh_saved_v.16b,l0_efgh_v.16b + mov l1_abcd_saved_v.16b,l1_abcd_v.16b + mov l1_efgh_saved_v.16b,l1_efgh_v.16b + + rev32 l0_msg0_v.16b,l0_msg0_v.16b + rev32 l0_msg1_v.16b,l0_msg1_v.16b + add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s + rev32 l0_msg2_v.16b,l0_msg2_v.16b + rev32 l0_msg3_v.16b,l0_msg3_v.16b + + rev32 l1_msg0_v.16b,l1_msg0_v.16b + rev32 l1_msg1_v.16b,l1_msg1_v.16b + add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s + rev32 l1_msg2_v.16b,l1_msg2_v.16b + rev32 l1_msg3_v.16b,l1_msg3_v.16b + + + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */ + sha256_4_rounds_high msg2,tmp1,tmp0 + sha256_4_rounds_high msg3,tmp0,tmp1 + + /* rounds 60-63 */ + mov l0_tmp2_v.16b,l0_abcd_v.16b + sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s + + mov l1_tmp2_v.16b,l1_abcd_v.16b + sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s + sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s + + + + add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s + add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s + add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s + add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s + + + bgt start_loop + str l0_abcd_q, [l0_job, 64] + str l0_efgh_q, [l0_job, 80] + str l1_abcd_q, [l1_job, 64] + str l1_efgh_q, [l1_job, 80] + + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .size sha256_mb_ce_x2, .-sha256_mb_ce_x2 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x428A2F98 + .word 0x71374491 + .word 0xB5C0FBCF + .word 0xE9B5DBA5 + .word 0x3956C25B + .word 0x59F111F1 + .word 0x923F82A4 + .word 0xAB1C5ED5 + .word 0xD807AA98 + .word 0x12835B01 + .word 0x243185BE + .word 0x550C7DC3 + .word 0x72BE5D74 + .word 0x80DEB1FE + .word 0x9BDC06A7 + .word 0xC19BF174 + .word 0xE49B69C1 + .word 0xEFBE4786 + .word 0x0FC19DC6 + .word 0x240CA1CC + .word 0x2DE92C6F + .word 0x4A7484AA + .word 0x5CB0A9DC + .word 0x76F988DA + .word 0x983E5152 + .word 0xA831C66D + .word 0xB00327C8 + .word 0xBF597FC7 + .word 0xC6E00BF3 + .word 0xD5A79147 + .word 0x06CA6351 + .word 0x14292967 + .word 0x27B70A85 + .word 0x2E1B2138 + .word 0x4D2C6DFC + .word 0x53380D13 + .word 0x650A7354 + .word 0x766A0ABB + .word 0x81C2C92E + .word 0x92722C85 + .word 0xA2BFE8A1 + .word 0xA81A664B + .word 0xC24B8B70 + 
.word 0xC76C51A3 + .word 0xD192E819 + .word 0xD6990624 + .word 0xF40E3585 + .word 0x106AA070 + .word 0x19A4C116 + .word 0x1E376C08 + .word 0x2748774C + .word 0x34B0BCB5 + .word 0x391C0CB3 + .word 0x4ED8AA4A + .word 0x5B9CCA4F + .word 0x682E6FF3 + .word 0x748F82EE + .word 0x78A5636F + .word 0x84C87814 + .word 0x8CC70208 + .word 0x90BEFFFA + .word 0xA4506CEB + .word 0xBEF9A3F7 + .word 0xC67178F2 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S new file mode 100644 index 000000000..6ed1591ba --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S @@ -0,0 +1,342 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 48-63 +*/ +.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req + ldr key_q , [tmp] + mov l0_tmp2_v.16b,l0_abcd_v.16b + mov l1_tmp2_v.16b,l1_abcd_v.16b + mov l2_tmp2_v.16b,l2_abcd_v.16b + add tmp,tmp,16 + add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s + add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s + add l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s + sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s + sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s + sha256h l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s + sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s + sha256h2 l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s + +.endm +/** +maros for round 0-47 +*/ +.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req + sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s + sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s + sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s + sha256_4_rounds_high \msg1,\tmp0,\tmp1 + sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s + sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s + sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,31 + + +/* +digest variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_efgh,1 + declare_var_vector_reg l1_abcd,2 + declare_var_vector_reg l1_efgh,3 + declare_var_vector_reg l2_abcd,4 + declare_var_vector_reg l2_efgh,5 + declare_var_vector_reg l1_abcd_saved,16 + declare_var_vector_reg l1_efgh_saved,17 + declare_var_vector_reg l0_abcd_saved,20 + declare_var_vector_reg l0_efgh_saved,21 + declare_var_vector_reg l2_abcd_saved,24 + declare_var_vector_reg l2_efgh_saved,25 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,6 + declare_var_vector_reg l0_tmp1,7 + declare_var_vector_reg l0_tmp2,8 + declare_var_vector_reg l1_tmp0,9 + declare_var_vector_reg l1_tmp1,10 + declare_var_vector_reg l1_tmp2,11 + declare_var_vector_reg l2_tmp0,12 + declare_var_vector_reg l2_tmp1,13 + declare_var_vector_reg l2_tmp2,14 +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + declare_var_vector_reg l1_msg0,20 + declare_var_vector_reg l1_msg1,21 + declare_var_vector_reg l1_msg2,22 + declare_var_vector_reg l1_msg3,23 + declare_var_vector_reg l2_msg0,24 + declare_var_vector_reg l2_msg1,25 + declare_var_vector_reg l2_msg2,26 + declare_var_vector_reg l2_msg3,27 + + + +/* + void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +*/ +/* +Arguements list +*/ + l0_job .req x0 + l1_job .req x1 + l2_job .req x2 + len .req w3 + l0_data .req x4 + l1_data .req x5 + l2_data .req x6 + tmp .req x7 + .global sha256_mb_ce_x3 + .type sha256_mb_ce_x3, %function +sha256_mb_ce_x3: + //push d8~d15 + stp d8,d9,[sp,-192]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + ldr l0_data, [l0_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_efgh_q, [l0_job, 80] + ldr l1_data, [l1_job] + ldr l1_abcd_q, [l1_job, 64] + ldr l1_efgh_q, [l1_job, 80] + ldr l2_data, [l2_job] + ldr l2_abcd_q, [l2_job, 64] + ldr l2_efgh_q, [l2_job, 80] + + + +start_loop: + + //load key addr + adr tmp, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data] + ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data] + ldr key_q,[tmp] + add tmp,tmp,16 + //adjust loop parameter + add l0_data,l0_data,64 + add l1_data,l1_data,64 + add l2_data,l2_data,64 + sub len, len, #1 + cmp len, 0 +/* + //backup digest + mov l0_abcd_saved_v.16b,l0_abcd_v.16b + mov l0_efgh_saved_v.16b,l0_efgh_v.16b + mov l1_abcd_saved_v.16b,l1_abcd_v.16b + mov l1_efgh_saved_v.16b,l1_efgh_v.16b + mov l2_abcd_saved_v.16b,l2_abcd_v.16b + mov l2_efgh_saved_v.16b,l2_efgh_v.16b +*/ + + rev32 l0_msg0_v.16b,l0_msg0_v.16b + rev32 l0_msg1_v.16b,l0_msg1_v.16b + add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s + rev32 l0_msg2_v.16b,l0_msg2_v.16b + rev32 l0_msg3_v.16b,l0_msg3_v.16b + + rev32 l1_msg0_v.16b,l1_msg0_v.16b + rev32 l1_msg1_v.16b,l1_msg1_v.16b + add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s + rev32 l1_msg2_v.16b,l1_msg2_v.16b + rev32 l1_msg3_v.16b,l1_msg3_v.16b + + rev32 l2_msg0_v.16b,l2_msg0_v.16b + rev32 l2_msg1_v.16b,l2_msg1_v.16b + add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s + rev32 l2_msg2_v.16b,l2_msg2_v.16b + rev32 l2_msg3_v.16b,l2_msg3_v.16b + + + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0 + + + + sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */ + + /* msg0 msg1 is free , share with digest regs */ + ldr l0_abcd_saved_q, [l0_job, 64] + ldr l1_abcd_saved_q, [l1_job, 64] + ldr l2_abcd_saved_q, [l2_job, 64] + ldr l0_efgh_saved_q, [l0_job, 80] + ldr l1_efgh_saved_q, [l1_job, 80] + ldr l2_efgh_saved_q, [l2_job, 80] + + sha256_4_rounds_high msg2,tmp1,tmp0 + sha256_4_rounds_high msg3,tmp0,tmp1 + + /* rounds 60-63 */ + mov l0_tmp2_v.16b,l0_abcd_v.16b + sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s + sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s + + mov l1_tmp2_v.16b,l1_abcd_v.16b + sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s + sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s + + mov l2_tmp2_v.16b,l2_abcd_v.16b + sha256h l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s + sha256h2 l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s + + /* combine state */ + add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s + add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s + add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s + add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s + add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s + add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s + + str l0_abcd_q, [l0_job, 64] + str l0_efgh_q, [l0_job, 80] + str l1_abcd_q, [l1_job, 64] + str l1_efgh_q, [l1_job, 80] + str l2_abcd_q, [l2_job, 64] + str l2_efgh_q, [l2_job, 80] + + bgt start_loop + + + ldp 
d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .size sha256_mb_ce_x3, .-sha256_mb_ce_x3 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x428A2F98 + .word 0x71374491 + .word 0xB5C0FBCF + .word 0xE9B5DBA5 + .word 0x3956C25B + .word 0x59F111F1 + .word 0x923F82A4 + .word 0xAB1C5ED5 + .word 0xD807AA98 + .word 0x12835B01 + .word 0x243185BE + .word 0x550C7DC3 + .word 0x72BE5D74 + .word 0x80DEB1FE + .word 0x9BDC06A7 + .word 0xC19BF174 + .word 0xE49B69C1 + .word 0xEFBE4786 + .word 0x0FC19DC6 + .word 0x240CA1CC + .word 0x2DE92C6F + .word 0x4A7484AA + .word 0x5CB0A9DC + .word 0x76F988DA + .word 0x983E5152 + .word 0xA831C66D + .word 0xB00327C8 + .word 0xBF597FC7 + .word 0xC6E00BF3 + .word 0xD5A79147 + .word 0x06CA6351 + .word 0x14292967 + .word 0x27B70A85 + .word 0x2E1B2138 + .word 0x4D2C6DFC + .word 0x53380D13 + .word 0x650A7354 + .word 0x766A0ABB + .word 0x81C2C92E + .word 0x92722C85 + .word 0xA2BFE8A1 + .word 0xA81A664B + .word 0xC24B8B70 + .word 0xC76C51A3 + .word 0xD192E819 + .word 0xD6990624 + .word 0xF40E3585 + .word 0x106AA070 + .word 0x19A4C116 + .word 0x1E376C08 + .word 0x2748774C + .word 0x34B0BCB5 + .word 0x391C0CB3 + .word 0x4ED8AA4A + .word 0x5B9CCA4F + .word 0x682E6FF3 + .word 0x748F82EE + .word 0x78A5636F + .word 0x84C87814 + .word 0x8CC70208 + .word 0x90BEFFFA + .word 0xA4506CEB + .word 0xBEF9A3F7 + .word 0xC67178F2 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S new file mode 100644 index 000000000..b1686ada1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S @@ -0,0 +1,380 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 48-63 +tmp0 : in +tmp1 : out +*/ +.macro sha256_4_rounds_high msg:req,tmp0:req + ldr key_q , [tmp] + mov tmp0_v.16b,l0_\tmp0\()_v.16b + mov tmp1_v.16b,l1_\tmp0\()_v.16b + add l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s + add l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s + mov tmp2_v.16b,l0_abcd_v.16b + mov tmp3_v.16b,l1_abcd_v.16b + sha256h l0_abcd_q,l0_efgh_q,tmp0_v.4s + sha256h l1_abcd_q,l1_efgh_q,tmp1_v.4s + sha256h2 l0_efgh_q,tmp2_q,tmp0_v.4s + sha256h2 l1_efgh_q,tmp3_q,tmp1_v.4s + + ldr key_q , [tmp] + mov tmp0_v.16b,l2_\tmp0\()_v.16b + mov tmp1_v.16b,l3_\tmp0\()_v.16b + add tmp,tmp,16 + add l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s + add l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s + mov tmp2_v.16b,l2_abcd_v.16b + mov tmp3_v.16b,l3_abcd_v.16b + sha256h l2_abcd_q,l2_efgh_q,tmp0_v.4s + sha256h l3_abcd_q,l3_efgh_q,tmp1_v.4s + sha256h2 l2_efgh_q,tmp2_q,tmp0_v.4s + sha256h2 l3_efgh_q,tmp3_q,tmp1_v.4s + + +.endm +/** +maros for round 0-47 +*/ +.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req + sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s + sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s + sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s + sha256su0 l3_\msg0\()_v.4s,l3_\msg1\()_v.4s + sha256_4_rounds_high \msg1,\tmp0 + sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s + sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s + sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s + sha256su1 l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,15 + + +/* +digest variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_efgh,1 + declare_var_vector_reg l1_abcd,2 + declare_var_vector_reg l1_efgh,3 + declare_var_vector_reg l2_abcd,4 + declare_var_vector_reg l2_efgh,5 + declare_var_vector_reg l3_abcd,6 + declare_var_vector_reg l3_efgh,7 + declare_var_vector_reg l1_abcd_saved,16 + declare_var_vector_reg l1_efgh_saved,17 + declare_var_vector_reg l0_abcd_saved,20 + declare_var_vector_reg l0_efgh_saved,21 + declare_var_vector_reg l2_abcd_saved,24 + declare_var_vector_reg l2_efgh_saved,25 + declare_var_vector_reg l3_abcd_saved,28 + declare_var_vector_reg l3_efgh_saved,29 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,8 + declare_var_vector_reg l1_tmp0,9 + declare_var_vector_reg l2_tmp0,10 + declare_var_vector_reg l3_tmp0,11 + + declare_var_vector_reg tmp0,12 + declare_var_vector_reg tmp1,13 + declare_var_vector_reg tmp2,14 + declare_var_vector_reg tmp3,15 + +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + declare_var_vector_reg l1_msg0,20 + declare_var_vector_reg l1_msg1,21 + declare_var_vector_reg l1_msg2,22 + declare_var_vector_reg l1_msg3,23 + declare_var_vector_reg l2_msg0,24 + declare_var_vector_reg l2_msg1,25 + declare_var_vector_reg l2_msg2,26 + declare_var_vector_reg l2_msg3,27 + declare_var_vector_reg l3_msg0,28 + declare_var_vector_reg l3_msg1,29 + declare_var_vector_reg l3_msg2,30 + declare_var_vector_reg l3_msg3,31 + + + +/* + void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +*/ +/* +Arguements list +*/ + l0_job .req 
x0 + l1_job .req x1 + l2_job .req x2 + l3_job .req x3 + len .req w4 + l0_data .req x5 + l1_data .req x6 + l2_data .req x7 + l3_data .req x8 + tmp .req x9 + .global sha256_mb_ce_x4 + .type sha256_mb_ce_x4, %function +sha256_mb_ce_x4: + //push d8~d15 + stp d8,d9,[sp,-192]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + ldr l0_data, [l0_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_efgh_q, [l0_job, 80] + ldr l1_data, [l1_job] + ldr l1_abcd_q, [l1_job, 64] + ldr l1_efgh_q, [l1_job, 80] + ldr l2_data, [l2_job] + ldr l2_abcd_q, [l2_job, 64] + ldr l2_efgh_q, [l2_job, 80] + ldr l3_data, [l3_job] + ldr l3_abcd_q, [l3_job, 64] + ldr l3_efgh_q, [l3_job, 80] + + + +start_loop: + + //load key addr + adr tmp, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data] + ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data] + ld1 {l3_msg0_v.4s-l3_msg3_v.4s},[l3_data] + ldr key_q,[tmp] + add tmp,tmp,16 + //adjust loop parameter + add l0_data,l0_data,64 + add l1_data,l1_data,64 + add l2_data,l2_data,64 + add l3_data,l3_data,64 + sub len, len, #1 + cmp len, 0 + + + rev32 l0_msg0_v.16b,l0_msg0_v.16b + rev32 l0_msg1_v.16b,l0_msg1_v.16b + add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s + rev32 l0_msg2_v.16b,l0_msg2_v.16b + rev32 l0_msg3_v.16b,l0_msg3_v.16b + + rev32 l1_msg0_v.16b,l1_msg0_v.16b + rev32 l1_msg1_v.16b,l1_msg1_v.16b + add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s + rev32 l1_msg2_v.16b,l1_msg2_v.16b + rev32 l1_msg3_v.16b,l1_msg3_v.16b + + rev32 l2_msg0_v.16b,l2_msg0_v.16b + rev32 l2_msg1_v.16b,l2_msg1_v.16b + add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s + rev32 l2_msg2_v.16b,l2_msg2_v.16b + rev32 l2_msg3_v.16b,l2_msg3_v.16b + + rev32 l3_msg0_v.16b,l3_msg0_v.16b + rev32 l3_msg1_v.16b,l3_msg1_v.16b + add l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s + rev32 l3_msg2_v.16b,l3_msg2_v.16b + rev32 l3_msg3_v.16b,l3_msg3_v.16b + + + + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 0-3 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0 + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 16-19 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0 + sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 32-35 */ + sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0 + sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0 + sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0 + + + + sha256_4_rounds_high msg1,tmp0 /* rounds 48-51 */ + + /* msg0 msg1 is free , share with digest regs */ + ldr l0_abcd_saved_q, [l0_job, 64] + ldr l1_abcd_saved_q, [l1_job, 64] + ldr l2_abcd_saved_q, [l2_job, 64] + ldr l3_abcd_saved_q, [l3_job, 64] + ldr l0_efgh_saved_q, [l0_job, 80] + ldr l1_efgh_saved_q, [l1_job, 80] + ldr l2_efgh_saved_q, [l2_job, 80] + ldr l3_efgh_saved_q, [l3_job, 80] + + sha256_4_rounds_high msg2,tmp0 + sha256_4_rounds_high msg3,tmp0 + + /* rounds 60-63 */ + mov tmp2_v.16b,l0_abcd_v.16b + sha256h l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s + sha256h2 l0_efgh_q,tmp2_q,l0_tmp0_v.4s + + mov tmp2_v.16b,l1_abcd_v.16b + sha256h l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s + sha256h2 l1_efgh_q,tmp2_q,l1_tmp0_v.4s + + mov tmp2_v.16b,l2_abcd_v.16b + sha256h l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s + sha256h2 l2_efgh_q,tmp2_q,l2_tmp0_v.4s + + mov tmp2_v.16b,l3_abcd_v.16b + sha256h l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s + sha256h2 l3_efgh_q,tmp2_q,l3_tmp0_v.4s + + /* combine state */ + add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s + add 
l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s + add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s + add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s + add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s + add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s + add l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s + add l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s + + str l0_abcd_q, [l0_job, 64] + str l0_efgh_q, [l0_job, 80] + str l1_abcd_q, [l1_job, 64] + str l1_efgh_q, [l1_job, 80] + str l2_abcd_q, [l2_job, 64] + str l2_efgh_q, [l2_job, 80] + str l3_abcd_q, [l3_job, 64] + str l3_efgh_q, [l3_job, 80] + + bgt start_loop + + + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .size sha256_mb_ce_x4, .-sha256_mb_ce_x4 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x428A2F98 + .word 0x71374491 + .word 0xB5C0FBCF + .word 0xE9B5DBA5 + .word 0x3956C25B + .word 0x59F111F1 + .word 0x923F82A4 + .word 0xAB1C5ED5 + .word 0xD807AA98 + .word 0x12835B01 + .word 0x243185BE + .word 0x550C7DC3 + .word 0x72BE5D74 + .word 0x80DEB1FE + .word 0x9BDC06A7 + .word 0xC19BF174 + .word 0xE49B69C1 + .word 0xEFBE4786 + .word 0x0FC19DC6 + .word 0x240CA1CC + .word 0x2DE92C6F + .word 0x4A7484AA + .word 0x5CB0A9DC + .word 0x76F988DA + .word 0x983E5152 + .word 0xA831C66D + .word 0xB00327C8 + .word 0xBF597FC7 + .word 0xC6E00BF3 + .word 0xD5A79147 + .word 0x06CA6351 + .word 0x14292967 + .word 0x27B70A85 + .word 0x2E1B2138 + .word 0x4D2C6DFC + .word 0x53380D13 + .word 0x650A7354 + .word 0x766A0ABB + .word 0x81C2C92E + .word 0x92722C85 + .word 0xA2BFE8A1 + .word 0xA81A664B + .word 0xC24B8B70 + .word 0xC76C51A3 + .word 0xD192E819 + .word 0xD6990624 + .word 0xF40E3585 + .word 0x106AA070 + .word 0x19A4C116 + .word 0x1E376C08 + .word 0x2748774C + .word 0x34B0BCB5 + .word 0x391C0CB3 + .word 0x4ED8AA4A + .word 0x5B9CCA4F + .word 0x682E6FF3 + .word 0x748F82EE + .word 0x78A5636F + .word 0x84C87814 + .word 0x8CC70208 + .word 0x90BEFFFA + .word 0xA4506CEB + .word 0xBEF9A3F7 + .word 0xC67178F2 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c new file mode 100644 index 000000000..12441a8e3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c @@ -0,0 +1,268 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
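+ /*
+ * Worked example of the buffering rule described above (illustrative
+ * comment, not part of the upstream source): suppose 16 bytes are already
+ * buffered (partial_block_buffer_length == 16) and the caller submits
+ * len == 200 bytes.
+ *
+ *   copy_len = SHA256_BLOCK_SIZE - 16 = 48;  // room left in the extra block
+ *
+ * The 48 bytes are copied, the extra block reaches 64 bytes and is submitted
+ * as a one-block job, and incoming_buffer_length becomes 200 - 48 = 152,
+ * i.e. two full blocks plus 24 bytes left over for the resubmit path.
+ */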
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
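+ /*
+ * Worked example of the final padding handled below (illustrative comment,
+ * assuming the usual 8-byte SHA-256 length field, SHA256_PADLENGTHFIELD_SIZE == 8):
+ * with total_length == 200, the extra block holds 200 & 63 == 8 residual bytes;
+ * hash_pad() writes 0x80 at offset 8, stores the big-endian bit count
+ * 200 * 8 == 1600 in bytes 56..63 and returns 1, so a single padded block is
+ * submitted. With total_length == 120 the residue is 56 bytes; 56 + 1 + 8 > 64,
+ * so the length field spills into a second block and hash_pad() returns 2.
+ */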
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx_slver_02020154; +struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_submit_avx_slver_02020155; +struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_flush_avx_slver_02020156; +struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c new file mode 100644 index 000000000..9c045659e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c @@ -0,0 +1,268 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
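+ /*
+ * For reference, a typical caller drives this manager roughly as follows
+ * (sketch only, not compiled here; error handling and the aligned allocation
+ * of the manager are omitted, and buf/buf_len are hypothetical caller
+ * variables):
+ *
+ *   SHA256_HASH_CTX_MGR *mgr = ...;          // suitably aligned allocation
+ *   SHA256_HASH_CTX ctx;
+ *   sha256_ctx_mgr_init_avx2(mgr);
+ *   hash_ctx_init(&ctx);
+ *   sha256_ctx_mgr_submit_avx2(mgr, &ctx, buf, buf_len, HASH_ENTIRE);
+ *   while (sha256_ctx_mgr_flush_avx2(mgr) != NULL)
+ *           ;                                // one completed job per call
+ *
+ * Once the context is returned as complete, the digest is available in
+ * ctx.job.result_digest.
+ */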
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
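+ /*
+ * Note (descriptive comment, not upstream): sha256_mb_mgr_submit_avx2() does
+ * not necessarily hand back the job that was just submitted; it may return
+ * whichever queued lane happened to complete, or NULL when nothing finished.
+ * The returned SHA256_JOB is cast back to its enclosing SHA256_HASH_CTX
+ * (the job is expected to sit at offset 0 of the context), and the
+ * surrounding while (ctx) loop then post-processes that job in the same way.
+ */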
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx2_slver_04020157; +struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 }; + +struct slver sha256_ctx_mgr_submit_avx2_slver_04020158; +struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 }; + +struct slver sha256_ctx_mgr_flush_avx2_slver_04020159; +struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c new file mode 100644 index 000000000..a1f068987 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c @@ -0,0 +1,273 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
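+ /*
+ * Note (descriptive comment, not upstream): the bitwise OR in the test below
+ * is intentional. Both operands are non-negative, so the branch is taken
+ * exactly when either one is non-zero, the same as a logical OR but without
+ * a short-circuit. For example:
+ *
+ *   partial_block_buffer_length == 0,  len == 32  ->  0 | 1  -> buffering path
+ *   partial_block_buffer_length == 16, len == 256 -> 16 | 0  -> buffering path
+ *   partial_block_buffer_length == 0,  len == 256 ->  0 | 0  -> straight to resubmit
+ */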
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
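+ /*
+ * Informal summary of the status values this loop moves a context through
+ * (descriptive comment, not upstream):
+ *
+ *   HASH_CTX_STS_PROCESSING              job still owned by the manager
+ *   PROCESSING | HASH_CTX_STS_LAST       final data seen, pad blocks pending
+ *   PROCESSING | HASH_CTX_STS_COMPLETE   pad blocks submitted, digest pending
+ *   HASH_CTX_STS_COMPLETE                reduced to COMPLETE above and returned
+ *   HASH_CTX_STS_IDLE                    nothing left to hash, waiting for input
+ */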
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx512_slver_0600015a; +struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 }; + +struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b; +struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 }; + +struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c; +struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c new file mode 100644 index 000000000..763057f12 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c @@ -0,0 +1,283 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * The sha256_ctx_avx512_ni functions are aimed at Cannon Lake, which adds
+ * SHANI on top of AVX512.
+ * Since SHANI is still slower than the multibuffer path when all lanes are
+ * full, sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni are
+ * essentially the same as their avx512 versions.
+ * sha256_ctx_mgr_flush_avx512_ni is different: it calls
+ * sha256_mb_mgr_flush_avx512_ni, which switches to SHANI once the number of
+ * lanes in use falls below a threshold.
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
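+ /*
+ * The dispatch idea mentioned in the comment at the top of this file, as a
+ * sketch (the helper names and the threshold are hypothetical and are not
+ * taken from the upstream flush implementation):
+ *
+ *   if (lanes_in_use(&mgr->mgr) < SHANI_LANE_THRESHOLD)
+ *           return flush_one_lane_with_shani(&mgr->mgr);  // few lanes: SHANI wins
+ *   return flush_with_avx512_multibuffer(&mgr->mgr);      // full lanes: multibuffer wins
+ */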
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca; +struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 }; + +struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb; +struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 }; + +struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc; +struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 }; + +#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c new file mode 100644 index 000000000..58bf024a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c @@ -0,0 +1,301 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define W(x) w[(x) & 15] + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX sha256_single(const void *data, uint32_t digest[]); +static inline void hash_init_digest(SHA256_WORD_T * digest); + +void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr) +{ +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
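+		// Clarifying note: a context whose COMPLETE bit is set can only be reused by
+		// starting a new hash with HASH_FIRST (or HASH_ENTIRE, which implies it).  The
+		// usual streaming pattern is FIRST on the first chunk, UPDATE on middle chunks,
+		// and LAST on the final chunk, as dispatched below.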
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + + sha256_init(ctx, buffer, len); + sha256_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + sha256_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sha256_update(ctx, buffer, len); + sha256_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sha256_init(ctx, buffer, len); + remain_len = sha256_update(ctx, buffer, len); + sha256_final(ctx, remain_len); + } + + return ctx; +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + + while (remain_len >= SHA256_BLOCK_SIZE) { + sha256_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SHA256_BLOCK_SIZE); + remain_len -= SHA256_BLOCK_SIZE; + ctx->total_length += SHA256_BLOCK_SIZE; + } + ctx->status = HASH_CTX_STS_IDLE; + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy(buf, buffer, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sha256_single(buf, digest); + if (i == 2 * SHA256_BLOCK_SIZE) { + sha256_single(buf + SHA256_BLOCK_SIZE, digest); + } + + ctx->status = HASH_CTX_STS_COMPLETE; +} + +void sha256_single(const void *data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 
0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_base_slver_000002f0; +struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 }; + +struct slver sha256_ctx_mgr_submit_base_slver_000002f1; +struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 }; + +struct slver sha256_ctx_mgr_flush_base_slver_000002f2; +struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c new file mode 100644 index 000000000..1483f631c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include "sha256_mb.h" +#include "memcpy_inline.h" + +extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr); +extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx, const void *buffer, + uint32_t len, HASH_CTX_FLAG flags); +extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr); + +void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr) +{ + return sha256_ctx_mgr_init_base(mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr) +{ + return sha256_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c new file mode 100644 index 000000000..f85f5c88b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c @@ -0,0 +1,256 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_sse(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
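+	// Worked example: with 40 bytes already buffered and a 100-byte submit, copy_len
+	// below is 64 - 40 = 24, the filled 64-byte block is submitted as a one-block job,
+	// and the remaining 76 bytes stay in incoming_buffer for sha256_ctx_mgr_resubmit()
+	// to split into one whole block plus a 12-byte tail.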
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
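+	// Note that job.len set above counts 64-byte blocks, not bytes (len was shifted
+	// right by SHA256_LOG2_BLOCK_SIZE).  If neither COMPLETE nor LAST applies at this
+	// point, the context is only waiting for more input: it is marked IDLE below and
+	// returned so the caller can submit the next chunk.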
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_sse_slver_00020151; +struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 }; + +struct slver sha256_ctx_mgr_submit_sse_slver_00020152; +struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 }; + +struct slver sha256_ctx_mgr_flush_sse_slver_00020153; +struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c new file mode 100644 index 000000000..e2c7e2738 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c @@ -0,0 +1,262 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_SHANI + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr) +{ + // Same with sse + sha256_mb_mgr_init_sse(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
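+	// The multi-buffer lanes only ever hash whole 64-byte blocks, so any sub-block tail
+	// is staged in partial_block_buffer.  That buffer is two blocks wide (it is the
+	// buffer hash_pad() later pads in place), which leaves room for the final padding
+	// to spill into a second block when the tail is too long for the 0x80 marker and
+	// the length field to fit after it.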
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
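+	// Worked example for hash_pad() (assuming the usual 8-byte SHA-256 length field):
+	// a 36-byte tail leaves room for the 0x80 marker and the bit length inside the
+	// first extra block, so it returns 1; a 60-byte tail pushes the length field into
+	// the second block, so it returns 2.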
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7; +struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 }; + +struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8; +struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 }; + +struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9; +struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 }; + +#endif // HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm new file mode 100644 index 000000000..f9fb6d230 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm @@ -0,0 +1,65 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Threshold constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; if number of lanes in use <= threshold, using sb func +%define SHA256_SB_THRESHOLD_SSE 1 +%define SHA256_SB_THRESHOLD_AVX 1 +%define SHA256_SB_THRESHOLD_AVX2 1 +%define SHA256_SB_THRESHOLD_AVX512 1 +%define SHA256_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb +%define SHA256_NI_SB_THRESHOLD_AVX512 6 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA256_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA256_JOB + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 8, 8 ; length in bytes +FIELD _result_digest, 8*4, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 + +%assign _SHA256_JOB_size _FIELD_OFFSET +%assign _SHA256_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c new file mode 100644 index 000000000..28f1f5118 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c @@ -0,0 +1,146 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha256_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS (SHA256_MAX_LANES - 1) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr) +{ + static int32_t last_lens[SHA256_MAX_LANES] = { 0 }; + int32_t len; + uint8_t num_unchanged = 0; + int i; + for (i = 0; i < SHA256_MAX_LANES; i++) { + len = (int32_t) mgr->mgr.lens[i]; + // len[i] in mgr consists of byte_length<<4 | lane_index + len = (len >= 16) ? (len >> 4 << 6) : 0; + printf("\t%d", len); + if (last_lens[i] > 0 && last_lens[i] == len) + num_unchanged += 1; + last_lens[i] = len; + } + printf("\n"); + return num_unchanged; +} + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + uint8_t num_ret, num_unchanged = 0; + int ret; + + printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1); + bufs[i] = (unsigned char *)malloc(lens[i]); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], lens[i]); + } + + for (i = 0; i < TEST_BUFS; i++) { + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha256_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sb_sha256 test + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + printf("Changes of lens inside mgr:\n"); + lens_print_and_check(mgr); + while (sha256_ctx_mgr_flush(mgr)) { + num_ret = lens_print_and_check(mgr); + num_unchanged = num_unchanged > num_ret ? 
num_unchanged : num_ret; + } + printf("Info of sha256_mb lens prints over\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) + printf("Test failed function check %d\n", fail); + else if (num_unchanged) + printf("SHA-NI is used when %d or %d jobs are uncompleted\n", + num_unchanged, num_unchanged + 1); + else + printf("SHA-NI is not used, or used for last job\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm new file mode 100644 index 000000000..ebba9ca36 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm @@ -0,0 +1,74 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA256 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA256_ARGS_X16 +;;; name size align +FIELD _digest, 4*8*16, 4 ; transposed digest +FIELD _data_ptr, 8*16, 8 ; array of pointers to data +END_FIELDS + +%assign _SHA256_ARGS_X4_size _FIELD_OFFSET +%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN +%assign _SHA256_ARGS_X8_size _FIELD_OFFSET +%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN +%assign _SHA256_ARGS_X16_size _FIELD_OFFSET +%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align +FIELD _lens, 4*16, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..69f27f42d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm @@ -0,0 +1,253 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_avx +extern sha256_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_avx, function +sha256_mb_mgr_flush_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and 
"args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..0ee0589cf --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm @@ -0,0 +1,274 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x8_avx2 +extern sha256_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sha256_mb_x8_avx2 and sha256_opt_x1 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_avx2, function +sha256_mb_mgr_flush_avx2: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 
0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x8_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + 
_GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..201cd42b0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm @@ -0,0 +1,288 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
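The flush routine above leans on a packing trick that is easy to miss in the addressing: each 32-bit entry of _lens holds the remaining length in 64-byte blocks shifted left by four, with the lane index in the low nibble, which is why a single unsigned minimum followed by "and idx, 0xF" / "shr len2, 4" yields both the lane to retire and the per-call block count. The scalar C sketch below models that scheme; NUM_LANES, the array shape and the helper names are illustrative only, not the library's types.

#include <stdint.h>

#define NUM_LANES  8            /* AVX2 manager; 4 for SSE, 16 for AVX512 */
#define EMPTY_LANE 0xFFFFFFFFu  /* parked lanes never win the minimum     */

/* Pack a lane's remaining block count and its index into one lens entry. */
static uint32_t pack_len(uint32_t blocks, uint32_t lane)
{
        return (blocks << 4) | lane;
}

/* Return the lane holding the shortest job; *blocks receives how many
 * blocks every in-flight lane can be advanced by in one kernel call. */
static uint32_t find_min_lane(const uint32_t lens[NUM_LANES], uint32_t *blocks)
{
        uint32_t min = lens[0];
        for (int i = 1; i < NUM_LANES; i++)
                if (lens[i] < min)
                        min = lens[i];
        *blocks = min >> 4;     /* "shr len2, 4" */
        return min & 0xF;       /* "and idx, 0xF" */
}

When *blocks comes back as zero the chosen lane has no work left and its job is completed immediately, which is the "jz len_is_0" path in the assembly.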
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sha256_mb_x16_avx512 +extern sha256_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sha256_mb_x16_avx2 and sha256_opt_x1 +%define idx rbp + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_avx512, function +sha256_mb_mgr_flush_avx512: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, 
x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x16_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_mb_mgr_flush_avx512 +no_sha256_mb_mgr_flush_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git 
a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm new file mode 100644 index 000000000..7bc9d32a4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm @@ -0,0 +1,295 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
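Both flush variants shown so far begin with the same copy_lane_data step: once a live lane has been found, its data pointer is copied into every lane whose job slot is NULL and that lane's length is forced to 0xFFFFFFFF, so the wide kernel sees a full set of valid pointers while the padding lanes can never be chosen as the minimum. A minimal scalar model of that step, with illustrative field names rather than the real SHA256_MB_JOB_MGR layout:

#include <stdint.h>
#include <stddef.h>

#define NUM_LANES 16                    /* AVX512 manager; 8 for AVX2 */

struct mgr_model {
        const uint8_t *data_ptr[NUM_LANES];
        uint32_t       lens[NUM_LANES];
        void          *job_in_lane[NUM_LANES];
};

/* Point every idle lane at the buffer of a live lane ("copy_lane_data"). */
static void fill_idle_lanes(struct mgr_model *s, int live_lane)
{
        const uint8_t *p = s->data_ptr[live_lane];

        for (int i = 0; i < NUM_LANES; i++) {
                if (s->job_in_lane[i] == NULL) {
                        s->data_ptr[i] = p;           /* borrow a valid pointer */
                        s->lens[i]     = 0xFFFFFFFFu; /* never the minimum      */
                }
        }
}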
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + %ifdef HAVE_AS_KNOWS_SHANI + +extern sha256_mb_x16_avx512 +extern sha256_ni_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sha256_mb_x16_avx2 and sha256_opt_x1 +%define idx rbp + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_avx512_ni, function +sha256_mb_mgr_flush_avx512_ni: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + 
vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func + cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512 + ja mb_processing + + ; lensN-len2=idx + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_ni_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x16_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + + %else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha256_mb_mgr_flush_avx512_ni + no_sha256_mb_mgr_flush_avx512_ni: + %endif + %endif ; HAVE_AS_KNOWS_SHANI 
+%else +%ifidn __OUTPUT_FORMAT__, win64 + global no_sha256_mb_mgr_flush_avx512_ni + no_sha256_mb_mgr_flush_avx512_ni: + %endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm new file mode 100644 index 000000000..69ae4bad5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm @@ -0,0 +1,254 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
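The AVX2, AVX512 and AVX512/SHA-NI flush paths above all make the same scheduling decision before touching any data: if _num_lanes_inuse is at or below the SHA256_SB_THRESHOLD_* / SHA256_NI_SB_THRESHOLD_* constant, the shortest job is handed to a single-buffer routine (sha256_opt_x1 or sha256_ni_x1) instead of paying for a full-width multi-buffer pass. The sketch below captures just that decision; the two helpers are placeholders standing in for those kernels, not their real C prototypes.

#include <stdint.h>

/* Placeholders for the single-buffer (sha256_opt_x1 / sha256_ni_x1) and
 * multi-buffer (sha256_mb_x4/x8/x16) kernels; signatures are illustrative. */
static void run_single_buffer(void *state, uint32_t lane) { (void)state; (void)lane; }
static void run_multi_buffer(void *state)                 { (void)state; }

static void flush_dispatch(void *state, uint32_t lanes_in_use,
                           uint32_t sb_threshold, uint32_t min_lane)
{
        /* "cmp dword [state + _num_lanes_inuse], THRESHOLD ; ja mb_processing" */
        if (lanes_in_use <= sb_threshold)
                run_single_buffer(state, min_lane);
        else
                run_multi_buffer(state);
}

Flushing with only one or two occupied lanes is typical at the tail of a workload, which is presumably why the single-buffer escape hatch is worth the extra branch.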
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_sse +extern sha256_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_sse, function +sha256_mb_mgr_flush_sse: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func + cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the 
same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_opt_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + movd xmm1, [state + _args_digest + 4*idx + 4*16] + pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm new file mode 100644 index 000000000..43b8fcbe4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm @@ -0,0 +1,261 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI +extern sha256_mb_x4_sse +extern sha256_ni_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha256_mb_mgr_flush_sse_ni, function +sha256_mb_mgr_flush_sse_ni: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 
8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + ; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func + cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE + ja mb_processing + + ; lensN-len2=idx + shr len2, 4 + mov [state + _lens + idx*4], DWORD(idx) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_ni_x1 + ; state and idx are intact + jmp len_is_0 + +mb_processing: + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + movd xmm1, [state + _args_digest + 4*idx + 4*16] + pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + +%else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha256_mb_mgr_flush_sse_ni + no_sha256_mb_mgr_flush_sse_ni: + %endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c new file mode 100644 index 000000000..903fb733b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c @@ -0,0 +1,41 @@ 
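The digest read-back in the flush routines above ("+ i*16" for SSE, "+ i*4*8" for AVX2, "+ i*4*16" for AVX512) reflects a word-major, lane-minor layout of _args_digest: word i of lane l sits at byte offset 4*(l + i*NUM_LANES), so each SIMD register holds the same digest word for every lane at once. A scalar model of that layout, with an illustrative array shape:

#include <stdint.h>

#define NUM_LANES 16            /* AVX512 manager; 8 for AVX2, 4 for SSE */

/* word-major, lane-minor: args_digest[word][lane] */
static uint32_t args_digest[8][NUM_LANES];

/* Scatter a job's 8-word digest into its lane (the submit direction). */
static void store_lane_digest(const uint32_t in[8], uint32_t lane)
{
        for (int i = 0; i < 8; i++)
                args_digest[i][lane] = in[i];
}

/* Gather it back when the job completes (the flush direction). */
static void load_lane_digest(uint32_t out[8], uint32_t lane)
{
        for (int i = 0; i < 8; i++)
                out[i] = args_digest[i][lane];  /* byte offset 4*(lane + i*NUM_LANES) */
}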
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" + +void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA256_X8_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c new file mode 100644 index 000000000..b875735f9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" + +void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA256_MAX_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c new file mode 100644 index 000000000..cf22c4aee --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha256_mb.h" + +void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF3210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA256_MIN_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..cb7d5790a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm @@ -0,0 +1,260 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
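The unused_lanes values set up by the three init functions above (0xF3210 for 4 lanes, 0xF76543210 for 8, 0xfedcba9876543210 for 16) form a stack of free lane indices, one per nibble; submit pops a lane off the low end and the completion paths push it back with "shl ..., 4; or ..., idx". The leading 0xF in the 4- and 8-lane values appears to act as a sentinel, so "all lanes busy" reduces to a compare against 0xF (the 16-lane manager tracks _num_lanes_inuse instead). A scalar sketch, assuming the 8-lane (AVX2) manager:

#include <stdint.h>

/* Initial value from sha256_mb_mgr_init_avx2: lanes 0..7 free, 0xF on top. */
static uint64_t unused_lanes = 0xF76543210ull;

static uint32_t pop_free_lane(void)            /* submit path */
{
        uint32_t lane = (uint32_t)(unused_lanes & 0xF);
        unused_lanes >>= 4;
        return lane;
}

static void push_free_lane(uint32_t lane)      /* completion path */
{
        unused_lanes = (unused_lanes << 4) | lane;
}

/* Only the sentinel left means every lane holds a job ("cmp unused_lanes, 0xF"). */
static int all_lanes_busy(void)
{
        return unused_lanes == 0xF;
}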
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_avx + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE 16*10 +%define _GPR_SAVE 8*5 +%define STACK_SPACE _GPR_SAVE + _XMM_SAVE + +; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_avx, function +sha256_mb_mgr_submit_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx 
+ and idx, 0xF + and len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..af2fc89ea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm @@ -0,0 +1,246 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
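sha256_mb_mgr_submit_avx above shows the submit-side half of the scheduler: the job is parked in a freshly popped lane, its length is packed into _lens, and the call returns NULL unless this submit consumed the last free lane; only then is the lane set advanced by the shortest job's length and that job retired. A compact scalar sketch of that control flow, with illustrative types and a placeholder kernel rather than the library's real API; the lens subtraction and completion bookkeeping are elided.

#include <stdint.h>
#include <stddef.h>

#define NUM_LANES 4                       /* the x4 AVX manager shown above */

struct job_model { uint32_t len_blocks; };

struct submit_model {
        uint64_t          unused_lanes;   /* nibble stack, 0xF3210 when idle */
        uint32_t          lens[NUM_LANES];
        struct job_model *job_in_lane[NUM_LANES];
};

/* Stand-in for sha256_mb_x4_avx: advances every lane by 'blocks' blocks. */
static void run_kernel(struct submit_model *s, uint32_t blocks)
{
        (void)s; (void)blocks;
}

static struct job_model *submit(struct submit_model *s, struct job_model *job)
{
        uint32_t lane = (uint32_t)(s->unused_lanes & 0xF);

        s->unused_lanes >>= 4;
        s->job_in_lane[lane] = job;
        s->lens[lane] = (job->len_blocks << 4) | lane;  /* "shl len,4 ; or len,lane" */

        if (s->unused_lanes != 0xF)       /* a lane is still free: defer work */
                return NULL;

        /* All lanes full: find the shortest job, run everyone that far,
         * then hand that job back (the "start_loop" / "len_is_0" path). */
        uint32_t min = s->lens[0];
        for (int i = 1; i < NUM_LANES; i++)
                if (s->lens[i] < min)
                        min = s->lens[i];
        run_kernel(s, min >> 4);
        return s->job_in_lane[min & 0xF];
}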
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "memcpy.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_avx2, function +sha256_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 
6*4*8], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x8_avx2 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..cdc477370 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm @@ -0,0 +1,261 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "memcpy.asm" +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sha256_mb_x16_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 +%define num_lanes_inuse r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_avx512, function +sha256_mb_mgr_submit_avx512: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + 
vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 16 + jne return_null + +start_loop: + ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8 + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x16_avx512 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + 
ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_mb_mgr_submit_avx512 +no_sha256_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..b1bbc7002 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm @@ -0,0 +1,261 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_sse + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE 16*10 +%define _GPR_SAVE 8*5 +%define STACK_SPACE _GPR_SAVE + _XMM_SAVE + +; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_sse, function +sha256_mb_mgr_submit_sse: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + movdqa [rsp + 16*0], xmm6 + movdqa [rsp + 16*1], xmm7 + movdqa [rsp + 16*2], xmm8 + movdqa [rsp + 16*3], xmm9 + movdqa [rsp + 16*4], xmm10 + movdqa [rsp + 16*5], xmm11 + movdqa [rsp + 16*6], xmm12 + movdqa [rsp + 16*7], xmm13 + movdqa [rsp + 16*8], xmm14 + movdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + movdqa xmm1, [job + _result_digest + 1*16] + movd [state + _args_digest + 4*lane + 0*16], xmm0 + pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + movd [state + _args_digest + 4*lane + 4*16], xmm1 + pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1 + pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2 + pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and 
len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + movd xmm1, [state + _args_digest + 4*idx + 4*16] + pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 16*0] + movdqa xmm7, [rsp + 16*1] + movdqa xmm8, [rsp + 16*2] + movdqa xmm9, [rsp + 16*3] + movdqa xmm10, [rsp + 16*4] + movdqa xmm11, [rsp + 16*5] + movdqa xmm12, [rsp + 16*6] + movdqa xmm13, [rsp + 16*7] + movdqa xmm14, [rsp + 16*8] + movdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm new file mode 100644 index 000000000..cb1dce641 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm @@ -0,0 +1,301 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI +extern sha256_mb_x4_sse +extern sha256_ni_x2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE 16*10 +%define _GPR_SAVE 8*7 +%define STACK_SPACE _GPR_SAVE + _XMM_SAVE + +; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_sse_ni, function +sha256_mb_mgr_submit_sse_ni: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 + mov [rsp + _XMM_SAVE + 8*5], r13 + mov [rsp + _XMM_SAVE + 8*6], r14 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + movdqa [rsp + 16*0], xmm6 + movdqa [rsp + 16*1], xmm7 + movdqa [rsp + 16*2], xmm8 + movdqa [rsp + 16*3], xmm9 + movdqa [rsp + 16*4], xmm10 + movdqa [rsp + 16*5], xmm11 + movdqa [rsp + 16*6], xmm12 + movdqa [rsp + 16*7], xmm13 + movdqa [rsp + 16*8], xmm14 + movdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + movdqa xmm1, [job + _result_digest + 1*16] + movd [state + _args_digest + 4*lane + 0*16], xmm0 
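+	; _args_digest is stored word-interleaved across the lanes: digest
+	; word k of lane n lives at byte offset 4*n + k*16, so each 16-byte
+	; row holds the same digest word for all four SSE lanes. The AVX2 and
+	; AVX512 submit managers earlier in this patch use the same layout
+	; with 32- and 64-byte rows (8 and 16 lanes).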
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + movd [state + _args_digest + 4*lane + 4*16], xmm1 + pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1 + pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2 + pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + + cmp unused_lanes, 0xF32 ; we will process two jobs at the same time + jne return_null ; wait for another sha_ni job + + ; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func + %if SHA256_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb + ; shani glue code + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + ; lensN-len2=idx + sub lens0, len2 + sub lens1, len2 + + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov r10, idx + or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16 + ; "state" and "args" are the same address, arg1 + ; len is arg2, idx and nlane in r10 + call sha256_ni_x2 + ; state and idx are intact + %else + ; original mb code + cmp unused_lanes, 0xF + jne return_null + + start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_sse + ; state and idx are intact + %endif +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movd xmm0, [state + _args_digest + 4*idx + 0*16] + pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + movd xmm1, [state + _args_digest + 4*idx + 4*16] + pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 16*0] + movdqa xmm7, [rsp + 16*1] + movdqa xmm8, [rsp + 16*2] + movdqa xmm9, [rsp + 16*3] + movdqa xmm10, [rsp + 16*4] + movdqa xmm11, [rsp + 16*5] + movdqa xmm12, [rsp + 16*6] + movdqa xmm13, [rsp + 16*7] + movdqa xmm14, [rsp + 16*8] + movdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + 
_XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov r13, [rsp + _XMM_SAVE + 8*5] + mov r14, [rsp + _XMM_SAVE + 8*6] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + +%else + %ifidn __OUTPUT_FORMAT__, win64 + global no_sha256_mb_mgr_submit_sse_ni + no_sha256_mb_mgr_submit_sse_ni: + %endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c new file mode 100644 index 000000000..768bfca78 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c @@ -0,0 +1,160 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include "sha256_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + SHA256(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sha256 test + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha256_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha256_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + SHA256(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sha256 test + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha256_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c new file mode 100644 index 000000000..adba77f3d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c @@ -0,0 +1,203 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	uint8_t *tmp_buf;
+	int ret;
+
+	printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha256_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			return 1;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contexts
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+		// Run sb_sha256 test
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%d fixed size, digest%d "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail) {
+		printf("Test failed 
function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha256_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sha256_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sha256_mb test + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha256_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sha256_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sha256_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_sha256 test + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha256_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c new file mode 100644 index 000000000..9535d80df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c @@ -0,0 +1,300 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha256_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SHA256_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS]; + +extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha256_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_sha256 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sha256_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha256_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned 
long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha256_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sha256_ref(bufs[i], digest_ref[i], lens[i]); + } + + sha256_ctx_mgr_init(mgr); + + // Run sha256_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SHA256_BLOCK_SIZE + + SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SHA256_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, + HASH_LAST); + else // submit the random update length as UPDATE + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
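+			// Note that ctx->user_data, set when each context was
+			// initialized above, is what maps a returned context back
+			// to its job index, since contexts can complete in a
+			// different order than they were submitted.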
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sha256_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha256_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SHA256_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha256_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c new file mode 100644 index 000000000..8a5b5a9b2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c @@ -0,0 +1,241 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include "sha256_mb.h" + +typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS]; + +#define MSGS 7 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +static uint8_t msg3[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<"; +static uint8_t msg4[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +static uint8_t msg5[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?"; +static uint8_t msg6[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +static uint8_t msg7[] = ""; + +static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039, + 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1 +}; + +static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6, + 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD +}; + +static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619, + 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3 +}; + +static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407, + 0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383 +}; + +static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD, + 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA +}; + +static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F, + 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE +}; + +static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924, + 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855 +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; + +static uint32_t *expResultDigest[MSGS] = { + expResultDigest1, expResultDigest2, expResultDigest3, + expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7 +}; + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != 
ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sha256_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sha256_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha256 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c new file mode 100644 index 000000000..51759d7a8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c @@ -0,0 +1,129 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN     4*1024
+# define TEST_LOOPS   4000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
+# define TEST_LEN     (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS   20
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS];
+	uint32_t i, j, t, fail = 0;
+	struct perf start, stop;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+		if (bufs[i] == NULL) {
+			printf("calloc failed test aborted\n");
+			return 1;
+		}
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+	}
+
+	int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if (ret) {
+		printf("alloc error: Fail");
+		return -1;
+	}
+	sha256_ctx_mgr_init(mgr);
+
+	// Start OpenSSL tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+	}
+	perf_stop(&stop);
+
+	printf("sha256_openssl" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	// Start mb tests
+	perf_start(&start);
+	for (t = 0; t < TEST_LOOPS; t++) {
+		for (i = 0; i < TEST_BUFS; i++)
+			sha256_ctx_mgr_submit(mgr,
+					      &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+		while (sha256_ctx_mgr_flush(mgr)) ;
+	}
+	perf_stop(&stop);
+
+	printf("multibinary_sha256" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%d, digest%d fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_be32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+
+	printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+	if (fail)
+		printf("Test failed function check %d\n", fail);
+	else
+		printf(" multibinary_sha256_ossl_perf: Pass\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..235ec74a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA256_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN     4*1024
+# define TEST_LOOPS   10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS]; + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + uint32_t nlanes; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha256_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA256(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha256_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb shortage tests + for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) { + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < nlanes; i++) + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, + HASH_ENTIRE); + + while (sha256_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < nlanes; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + } + + printf("Multi-buffer sha256 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm new file mode 100644 index 000000000..f45669c6e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm @@ -0,0 +1,930 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;; code to compute oct SHA256 using SSE-256 / AVX512 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD _DIGEST_SAVE, 8*64, 64 +FIELD _rsp, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 +%define TBL var2 + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + +; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d15 d14 d13 d12 d11 d10 
d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} + +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 
i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} + +;; At this point, the registers that contain interesting data are: +;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 +;; Can use t1 and r14 as scratch registers + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, 
[PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + +;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 +;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 +;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 +;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 + +; Main processing loop per round +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddd T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, T1, %%WT ; T1 = T1 + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpaddd D, D, T1 ; D = D + T1 + + vprord H, A, 2 ; ROR_2(A) + vprord TMP2, A, 13 ; ROR_13(A) + vprord TMP3, A, 22 ; ROR_22(A) + vmovdqa32 TMP0, A + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogd H, 
TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +; This is supposed to be SKL optimized assuming: +; vpternlog, vpaddd ports 5,8 +; vprord ports 1,8 +; However, vprord is only working on port 8 +; +; Main processing loop per round +; Get the msg schedule word 16 from the current, now unneccessary word +%macro PROCESS_LOOP_00_47 5 +%define %%WT %1 +%define %%ROUND %2 +%define %%WTp1 %3 +%define %%WTp9 %4 +%define %%WTp14 %5 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + ;; For next value in msg schedule + ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt + + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, H, %%WT ; T1 = H + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP6 ; T1 = T1 + Kt + vprord H, A, 2 ; ROR_2(A) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vprord TMP2, A, 13 ; ROR_13(A) + vmovdqa32 TMP0, A + vprord TMP3, A, 22 ; ROR_22(A) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpaddd D, D, T1 ; D = D + T1 + vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + + + vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_63 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + +%endmacro + +; Note this is reading in a block of data for one lane +; When all 16 are read, the data must be transposed to build msg schedule +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +align 64 + +;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha256_mb_x16_avx512) 
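+;; Layout note: each zmm digest register A..H holds one 32-bit state word for
+;; all 16 lanes (DIGEST + n*64 is a 16-lane array of word n), and W0..W15 hold
+;; the 16 message-schedule words in the same lane-sliced form; TRANSPOSE16
+;; converts the per-lane input blocks into that layout before the rounds run.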
+sha256_mb_x16_avx512: + endbranch + mov rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _rsp], rax + lea TBL, [TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*64] + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + vmovups F, [DIGEST + 5*64] + vmovups G, [DIGEST + 6*64] + vmovups H, [DIGEST + 7*64] + + ; Do we need to transpose digests??? + ; SHA1 does not, but SHA256 has been + + xor IDX, IDX + + ;; Read in first block of input data + ;; Transpose input data + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + + vmovups W0,[inp0+IDX] + vmovups W1,[inp1+IDX] + vmovups W2,[inp2+IDX] + vmovups W3,[inp3+IDX] + vmovups W4,[inp4+IDX] + vmovups W5,[inp5+IDX] + vmovups W6,[inp6+IDX] + vmovups W7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups W8, [inp0+IDX] + vmovups W9, [inp1+IDX] + vmovups W10,[inp2+IDX] + vmovups W11,[inp3+IDX] + vmovups W12,[inp4+IDX] + vmovups W13,[inp5+IDX] + vmovups W14,[inp6+IDX] + vmovups W15,[inp7+IDX] + + +lloop: + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + vmovdqa32 TMP3, [TBL] ; First K + + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 64 + + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + +%assign I 0 +%rep 16 + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process first 48 rounds + ; Calculate next Wt+16 after processing is complete and Wt is unneeded + + ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep 48 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) +%assign I (I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + + ; Check is this is the last block + sub SIZE, 1 + je lastLoop + + ; Process last 16 rounds + ; Read in next block msg data for use in first 16 words of msg sched +%assign I 48 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_00_15 APPEND(W,J), J +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] + + jmp lloop + +lastLoop: + ; Process last 16 rounds +%assign I 48 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddd C, C, [rsp + _DIGEST_SAVE + 
64*2] + vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] + + ;; update into data pointers +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + ; Write out digest + ; Do we need to untranspose digests??? + vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + vmovups [DIGEST + 5*64], F + vmovups [DIGEST + 6*64], G + vmovups [DIGEST + 7*64], H + + + mov rsp, [rsp + _rsp] + ret + + section .data +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 
0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 
0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 
0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_mb_x16_avx512 +no_sha256_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm new file mode 100644 index 000000000..7f8f8829b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm @@ -0,0 +1,431 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA256 using AVX +;; Logic designed/laid out by JDG + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + +%define TABLE K256_4_MB +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K 
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +%define DIGEST_SIZE 8*SZ4 +%define DATA 16*SZ4 +%define ALIGNMENT 1*8 +; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8 +%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT) +%define _DIGEST (DATA) + +%define VMOVPS vmovups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define IDX rax +%define ROUND rbx +%define TBL r12 + +;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len); +;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used) +;; arg 2 : arg2 : size of data in blocks (assumed >= 1) +;; +;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15 +;; +mk_global sha256_mb_x4_avx, function, internal +align 32 +sha256_mb_x4_avx: + endbranch + sub rsp, FRAMESZ + + ;; Initialize digests + vmovdqa a,[arg1+0*SZ4] + vmovdqa b,[arg1+1*SZ4] + vmovdqa c,[arg1+2*SZ4] + vmovdqa d,[arg1+3*SZ4] + vmovdqa e,[arg1+4*SZ4] + vmovdqa f,[arg1+5*SZ4] + vmovdqa g,[arg1+6*SZ4] + vmovdqa h,[arg1+7*SZ4] + + lea TBL,[TABLE] + + ;; transpose input onto stack + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPS TT2,[inp0+IDX+i*16] + VMOVPS TT1,[inp1+IDX+i*16] + VMOVPS TT4,[inp2+IDX+i*16] + VMOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 
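+; Rounds 16..63: ROUND_16_XX rebuilds each new schedule word from the 16-entry
+; circular buffer kept on the stack, W[t] = sigma1(W[t-2]) + W[t-7] +
+; sigma0(W[t-15]) + W[t-16], and then feeds it into ROUND_00_15.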
+Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ4] + vpaddd b, b, [rsp + _DIGEST + 1*SZ4] + vpaddd c, c, [rsp + _DIGEST + 2*SZ4] + vpaddd d, d, [rsp + _DIGEST + 3*SZ4] + vpaddd e, e, [rsp + _DIGEST + 4*SZ4] + vpaddd f, f, [rsp + _DIGEST + 5*SZ4] + vpaddd g, g, [rsp + _DIGEST + 6*SZ4] + vpaddd h, h, [rsp + _DIGEST + 7*SZ4] + + + sub arg2, 1 + jne lloop + + ; write digests out + vmovdqa [arg1+0*SZ4],a + vmovdqa [arg1+1*SZ4],b + vmovdqa [arg1+2*SZ4],c + vmovdqa [arg1+3*SZ4],d + vmovdqa [arg1+4*SZ4],e + vmovdqa [arg1+5*SZ4],f + vmovdqa [arg1+6*SZ4],g + vmovdqa [arg1+7*SZ4],h + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [arg1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [arg1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [arg1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + ret + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 
0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm new file mode 100644 index 000000000..2d349abbc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm @@ -0,0 +1,426 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA256 using SSE +;; Logic designed/laid out by JDG + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + +%define TABLE K256_4_MB +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + rsp],%%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + + movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + movdqa a1, 
[SZ4*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp] + paddd a1, [SZ4*((%%i-7)&0xf) + rsp] + paddd %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +%define DIGEST_SIZE 8*SZ4 +%define DATA 16*SZ4 +%define ALIGNMENT 1*8 +; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8 +%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT) +%define _DIGEST (DATA) + +%define MOVPS movups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define IDX rax +%define ROUND rbx +%define TBL r12 + +;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len); +;; arg 1 : pointer args (only 4 of the 8 lanes used) +;; arg 2 : size of data in blocks (assumed >= 1) +;; +;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15 +;; + +mk_global sha256_mb_x4_sse, function, internal +align 32 +sha256_mb_x4_sse: + endbranch + sub rsp, FRAMESZ + + ;; Initialize digests + movdqa a,[arg1+0*SZ4] + movdqa b,[arg1+1*SZ4] + movdqa c,[arg1+2*SZ4] + movdqa d,[arg1+3*SZ4] + movdqa e,[arg1+4*SZ4] + movdqa f,[arg1+5*SZ4] + movdqa g,[arg1+6*SZ4] + movdqa h,[arg1+7*SZ4] + + lea TBL,[TABLE] + + ;; transpose input onto stack + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ4], a + movdqa [rsp + _DIGEST + 1*SZ4], b + movdqa [rsp + _DIGEST + 2*SZ4], c + movdqa [rsp + _DIGEST + 3*SZ4], d + movdqa [rsp + _DIGEST + 4*SZ4], e + movdqa [rsp + _DIGEST + 5*SZ4], f + movdqa [rsp + _DIGEST + 6*SZ4], g + movdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + MOVPS TT2,[inp0+IDX+i*16] + MOVPS TT1,[inp1+IDX+i*16] + MOVPS TT4,[inp2+IDX+i*16] + MOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + pshufb TT0, TMP + pshufb TT1, TMP + pshufb TT2, TMP + pshufb TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddd a, [rsp + _DIGEST + 0*SZ4] + paddd b, [rsp + _DIGEST + 1*SZ4] + paddd c, [rsp + _DIGEST + 2*SZ4] + paddd d, [rsp + _DIGEST + 3*SZ4] + paddd e, [rsp + _DIGEST + 4*SZ4] + paddd f, [rsp + _DIGEST + 5*SZ4] + paddd g, [rsp + _DIGEST + 6*SZ4] + paddd h, [rsp + _DIGEST + 7*SZ4] + + + sub arg2, 1 + jne lloop + + ; write digests out + movdqa [arg1+0*SZ4],a + movdqa [arg1+1*SZ4],b + movdqa [arg1+2*SZ4],c + movdqa [arg1+3*SZ4],d + movdqa [arg1+4*SZ4],e + movdqa [arg1+5*SZ4],f + movdqa [arg1+6*SZ4],g + movdqa [arg1+7*SZ4],h + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [arg1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [arg1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [arg1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + ret + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 
+ dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm new file mode 100644 index 000000000..dbd9db1b8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm @@ -0,0 +1,620 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute oct SHA256 using SSE-256 / AVX2 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers ymm0-15 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi + %define reg3 rcx + %define reg4 rdx +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx + %define reg3 rsi + %define reg4 rdi +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL reg3 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 reg4 + +; ymm0 a +; ymm1 b +; ymm2 c +; ymm3 d +; ymm4 e +; ymm5 f +; ymm6 g TMP0 +; ymm7 h TMP1 +; ymm8 T1 TT0 +; ymm9 TT1 +; ymm10 TT2 +; ymm11 TT3 +; ymm12 a0 TT4 +; ymm13 a1 TT5 +; ymm14 a2 TT6 +; ymm15 TMP TT7 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define T1 ymm8 + +%define a0 ymm12 +%define a1 ymm13 +%define a2 ymm14 +%define TMP ymm15 + +%define TMP0 ymm6 +%define TMP1 ymm7 + +%define TT0 ymm8 +%define TT1 ymm9 +%define TT2 ymm10 +%define TT3 ymm11 +%define TT4 ymm12 +%define TT5 ymm13 +%define TT6 ymm14 +%define TT7 ymm15 + +%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ8 +%define PTR_SZ 8 +%define SHA256_DIGEST_WORD_SIZE 4 +%define MAX_SHA256_LANES 8 +%define NUM_SHA256_DIGEST_WORDS 8 +%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE) + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +struc stack_frame + .data resb 16*SZ8 + .digest resb 8*SZ8 + .ytmp resb 4*SZ8 + .rsp resb 8 +endstruc +%define FRAMESZ stack_frame_size +%define 
_DIGEST stack_frame.digest +%define _YTMP stack_frame.ytmp +%define _RSP_SAVE stack_frame.rsp + +%define YTMP0 rsp + _YTMP + 0*SZ8 +%define YTMP1 rsp + _YTMP + 1*SZ8 +%define YTMP2 rsp + _YTMP + 2*SZ8 +%define YTMP3 rsp + _YTMP + 3*SZ8 + +%define VMOVPS vmovups + +; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +; "transpose" data in {r0...r7} using temps {t0...t1} +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly 
in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp] + vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] + vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + +;; void sha256_x8_avx2(SHA256_ARGS *args, uint64_t bytes); +;; arg 1 : STATE : pointer to input data +;; arg 2 : INP_SIZE : size of input in blocks +mk_global sha256_mb_x8_avx2, function, internal +align 16 +sha256_mb_x8_avx2: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + ; save rsp, allocate 32-byte aligned for local variables + mov IDX, rsp + sub rsp, FRAMESZ + and rsp, ~31 + mov [rsp + _RSP_SAVE], IDX + + + ;; Load the pre-transposed incoming digest. 
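+	;; (Digest layout note: the manager stores the digest word-major across
+	;;  lanes -- 32-bit word j of lane i lives at
+	;;  STATE + j*SHA256_DIGEST_ROW_SIZE + i*4, roughly
+	;;  uint32_t digest[NUM_SHA256_DIGEST_WORDS][MAX_SHA256_LANES] in C terms --
+	;;  so each row load below fills one ymm register with the same digest
+	;;  word for all 8 lanes and no transpose of the digest itself is needed.)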
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE] + vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE] + vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE] + vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE] + vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE] + vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE] + vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE] + vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE] + + lea TBL,[K256_8_MB] + + ;; load the address of each of the 4 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ] + mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ] + mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ] + mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ] + mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ] + mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ] + mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ] + mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ8], a + vmovdqa [rsp + _DIGEST + 1*SZ8], b + vmovdqa [rsp + _DIGEST + 2*SZ8], c + vmovdqa [rsp + _DIGEST + 3*SZ8], d + vmovdqa [rsp + _DIGEST + 4*SZ8], e + vmovdqa [rsp + _DIGEST + 5*SZ8], f + vmovdqa [rsp + _DIGEST + 6*SZ8], g + vmovdqa [rsp + _DIGEST + 7*SZ8], h +%assign i 0 +%rep 2 + VMOVPS TT0,[inp0+IDX+i*32] + VMOVPS TT1,[inp1+IDX+i*32] + VMOVPS TT2,[inp2+IDX+i*32] + VMOVPS TT3,[inp3+IDX+i*32] + VMOVPS TT4,[inp4+IDX+i*32] + VMOVPS TT5,[inp5+IDX+i*32] + VMOVPS TT6,[inp6+IDX+i*32] + VMOVPS TT7,[inp7+IDX+i*32] + vmovdqa [YTMP0], g + vmovdqa [YTMP1], h + TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1 + vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK] + vmovdqa g, [YTMP0] + vpshufb TT0, TT0, TMP1 + vpshufb TT1, TT1, TMP1 + vpshufb TT2, TT2, TMP1 + vpshufb TT3, TT3, TMP1 + vpshufb TT4, TT4, TMP1 + vpshufb TT5, TT5, TMP1 + vpshufb TT6, TT6, TMP1 + vpshufb TT7, TT7, TMP1 + vmovdqa h, [YTMP1] + vmovdqa [YTMP0], TT4 + vmovdqa [YTMP1], TT5 + vmovdqa [YTMP2], TT6 + vmovdqa [YTMP3], TT7 + ROUND_00_15 TT0,(i*8+0) + vmovdqa TT0, [YTMP0] + ROUND_00_15 TT1,(i*8+1) + vmovdqa TT1, [YTMP1] + ROUND_00_15 TT2,(i*8+2) + vmovdqa TT2, [YTMP2] + ROUND_00_15 TT3,(i*8+3) + vmovdqa TT3, [YTMP3] + ROUND_00_15 TT0,(i*8+4) + ROUND_00_15 TT1,(i*8+5) + ROUND_00_15 TT2,(i*8+6) + ROUND_00_15 TT3,(i*8+7) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + +%assign i (i*8) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ8] + vpaddd b, b, [rsp + _DIGEST + 1*SZ8] + vpaddd c, c, [rsp + _DIGEST + 2*SZ8] + vpaddd d, d, [rsp + _DIGEST + 3*SZ8] + vpaddd e, e, [rsp + _DIGEST + 4*SZ8] + vpaddd f, f, [rsp + _DIGEST + 5*SZ8] + vpaddd g, g, [rsp + _DIGEST + 6*SZ8] + vpaddd h, h, [rsp + _DIGEST + 7*SZ8] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a + vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b + vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c + vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d + vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e + vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f + vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g + vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _args_data_ptr + 0*8], inp0 + add inp1, IDX + mov [STATE + _args_data_ptr + 1*8], inp1 + add inp2, IDX + mov [STATE + _args_data_ptr + 2*8], inp2 + add inp3, IDX + mov [STATE + 
_args_data_ptr + 3*8], inp3 + add inp4, IDX + mov [STATE + _args_data_ptr + 4*8], inp4 + add inp5, IDX + mov [STATE + _args_data_ptr + 5*8], inp5 + add inp6, IDX + mov [STATE + _args_data_ptr + 6*8], inp6 + add inp7, IDX + mov [STATE + _args_data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, [rsp + _RSP_SAVE] + ret + +section .data +align 64 +K256_8_MB: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 
0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm new file mode 100644 index 000000000..af54f7cc3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm @@ -0,0 +1,125 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern sha256_ctx_mgr_init_sse +extern sha256_ctx_mgr_submit_sse +extern sha256_ctx_mgr_flush_sse + +extern sha256_ctx_mgr_init_avx +extern sha256_ctx_mgr_submit_avx +extern sha256_ctx_mgr_flush_avx + +extern sha256_ctx_mgr_init_avx2 +extern sha256_ctx_mgr_submit_avx2 +extern sha256_ctx_mgr_flush_avx2 + +extern sha256_ctx_mgr_init_base +extern sha256_ctx_mgr_submit_base +extern sha256_ctx_mgr_flush_base + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sha256_ctx_mgr_init_avx512 + extern sha256_ctx_mgr_submit_avx512 + extern sha256_ctx_mgr_flush_avx512 +%endif + +%ifdef HAVE_AS_KNOWS_SHANI + extern sha256_ctx_mgr_init_sse_ni + extern sha256_ctx_mgr_submit_sse_ni + extern sha256_ctx_mgr_flush_sse_ni +%endif + +%ifdef HAVE_AS_KNOWS_AVX512 + %ifdef HAVE_AS_KNOWS_SHANI + extern sha256_ctx_mgr_init_avx512_ni + extern sha256_ctx_mgr_submit_avx512_ni + extern sha256_ctx_mgr_flush_avx512_ni + %endif +%endif + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. 
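+;;;
+;;; Conceptually (an illustrative sketch, not the literal macro expansion):
+;;;   sha256_ctx_mgr_submit:  jmp [sha256_ctx_mgr_submit_dispatched]
+;;; where *_dispatched initially points at a one-shot resolver that probes
+;;; the CPU features, stores the address of the best candidate registered
+;;; below (base, sse, avx, avx2, avx512, or the *_ni forms when the
+;;; assembler understands SHA-NI) back into *_dispatched, and then jumps to
+;;; it, so every later call lands directly on the selected implementation.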
+ +; Initialise symbols +mbin_interface sha256_ctx_mgr_init +mbin_interface sha256_ctx_mgr_submit +mbin_interface sha256_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6's extension through replacing base by sse version + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \ + sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \ + sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \ + sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \ + sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \ + sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \ + sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni + %else + mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \ + sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \ + sha256_ctx_mgr_init_avx512 + mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \ + sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \ + sha256_ctx_mgr_submit_avx512 + mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \ + sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \ + sha256_ctx_mgr_flush_avx512 + %endif +%else + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \ + sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \ + sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \ + sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni + %else + mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \ + sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2 + mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \ + sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2 + mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \ + sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2 + %endif +%endif + +;;; func core, ver, snum +slversion sha256_ctx_mgr_init, 00, 04, 0160 +slversion sha256_ctx_mgr_submit, 00, 04, 0161 +slversion sha256_ctx_mgr_flush, 00, 04, 0162 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm new file mode 100644 index 000000000..25fc9ce16 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm @@ -0,0 +1,361 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define SHUF_MASK xmm8 + +%define ABEF_SAVE xmm9 +%define CDGH_SAVE xmm10 + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define TMP r9 ; local variable -- assistant to address digest +%define TBL rax +;%define TMP2 r8 ; local variable -- assistant to address digest +align 32 + +; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r9~r11, xmm0-xmm10 +; +mk_global sha256_ni_x1, function, internal +sha256_ni_x1: + endbranch + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*IDX] + ;; Initialize digest + ;; digests -> ABEF(state0), CDGH(state1) + pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A + pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B + pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D + pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E + pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pinsrd STATE0, 
[TMP + 2*NLANX4], 0 ; F + pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + lea TBL, [TABLE] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + IDX*8] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + ; /* Save hash values for addition after rounds */ + movdqa ABEF_SAVE, STATE0 + movdqa CDGH_SAVE, STATE1 + + ; /* Rounds 0-3 */ + movdqu MSG, [DPTR + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 4-7 */ + movdqu MSG, [DPTR + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 8-11 */ + movdqu MSG, [DPTR + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 12-15 */ + movdqu MSG, [DPTR + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 16-19 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 20-23 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 24-27 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 28-31 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 32-35 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 36-39 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 40-43 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + 
sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 44-47 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 48-51 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 52-55 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 56-59 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 60-63 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Add current hash values with previously saved */ + paddd STATE0, ABEF_SAVE + paddd STATE1, CDGH_SAVE + + ; Increment data pointer and loop if more to process + add DPTR, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*IDX] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMP + 0*NLANX4], STATE0, 3 ; A + pextrd [TMP + 1*NLANX4], STATE0, 2 ; B + pextrd [TMP + 2*NLANX4], STATE1, 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], STATE1, 2 ; D + pextrd [TMP + 2*NLANX4], STATE0, 1 ; E + pextrd [TMP + 4*NLANX4], STATE1, 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pextrd [TMP + 2*NLANX4], STATE0, 0 ; F + pextrd [TMP + 4*NLANX4], STATE1, 0 ; H + + ; update input pointers + mov [MGR + _data_ptr + IDX*8], DPTR + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + + ret + + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_ni_x1 +no_sha256_ni_x1: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm new file mode 100644 index 000000000..74cfc93b6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm @@ -0,0 +1,574 @@ 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define FRAMESZ 64 ; space for ABCDE +%define RSPSAVE rax + +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define STATE0b xmm8 +%define STATE1b xmm9 +%define MSGTMP0b xmm10 +%define MSGTMP1b xmm11 +%define MSGTMP2b xmm12 +%define MSGTMP3b xmm13 +%define MSGTMP4b xmm14 + +%define SHUF_MASK xmm15 + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define DPTRb r12 +%define TMP r9 ; local variable -- assistant to address digest +%define TBL r13 +%define TMPb r14 ; local variable -- assistant to address digest +align 32 + +; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r9~r14, xmm0-xmm15 +; 
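+; This two-lane variant always services lanes 0 and 1 of the manager (note
+; the fixed [MGR + 4*0]/[MGR + 4*1] and _data_ptr + 8*0/8*1 offsets below)
+; and interleaves the two SHA-NI round sequences four rounds at a time, so
+; the two independent sha256rnds2 dependency chains can overlap. Both data
+; pointers advance by the same NBLK blocks per call. As in sha256_ni_x1,
+; each lane's digest is kept in the ABEF/CDGH packing that sha256rnds2
+; expects: STATE0/STATE0b = {A,B,E,F}, STATE1/STATE1b = {C,D,G,H}.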
+mk_global sha256_ni_x2, function, internal +sha256_ni_x2: + endbranch + mov RSPSAVE, rsp + sub rsp, FRAMESZ + and rsp, ~0xF ; Align 16Bytes downward + + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*0] + lea TMPb, [MGR + 4*1] + + ;; Initialize digest + ;; digests -> ABEF(state0), CDGH(state1) + pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A + pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B + pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D + pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E + pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F + pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H + + pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A + pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B + pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D + pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E + pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G + lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F + pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + lea TBL, [TABLE] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + 8*0] + mov DPTRb,[MGR + _data_ptr + 8*1] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + ; /* Save hash values for addition after rounds */ + movdqa [rsp + 0*16], STATE0 + movdqa [rsp + 1*16], STATE1 + + movdqa [rsp + 2*16], STATE0b + movdqa [rsp + 3*16], STATE1b + + ; /* Rounds 0-3 */ + movdqu MSG, [DPTR + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqu MSG, [DPTRb + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0b, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 4-7 */ + movdqu MSG, [DPTR + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + movdqu MSG, [DPTRb + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1b, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 8-11 */ + movdqu MSG, [DPTR + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqu MSG, [DPTRb + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2b, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 12-15 */ + movdqu MSG, [DPTR + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqu MSG, [DPTRb + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3b, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 
STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 16-19 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 20-23 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 24-27 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 28-31 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 32-35 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 36-39 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, 
MSGTMP1 + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 40-43 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 44-47 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 48-51 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 52-55 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 56-59 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 60-63 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Add current hash 
values with previously saved */ + paddd STATE0, [rsp + 0*16] + paddd STATE1, [rsp + 1*16] + + paddd STATE0b, [rsp + 2*16] + paddd STATE1b, [rsp + 3*16] + + ; Increment data pointer and loop if more to process + add DPTR, 64 + add DPTRb, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*0] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMP + 0*NLANX4], STATE0, 3 ; A + pextrd [TMP + 1*NLANX4], STATE0, 2 ; B + pextrd [TMP + 2*NLANX4], STATE1, 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], STATE1, 2 ; D + pextrd [TMP + 2*NLANX4], STATE0, 1 ; E + pextrd [TMP + 4*NLANX4], STATE1, 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pextrd [TMP + 2*NLANX4], STATE0, 0 ; F + pextrd [TMP + 4*NLANX4], STATE1, 0 ; H + + lea TMPb, [MGR + 4*1] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A + pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B + pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D + pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E + pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G + lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*IDX + 6*NLANX4 + pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F + pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H + + ; update input pointers + mov [MGR + _data_ptr + 0*8], DPTR + mov [MGR + _data_ptr + 1*8], DPTRb + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, RSPSAVE + + ret + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_ni_x2 +no_sha256_ni_x2: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm new file mode 100644 index 000000000..fc13ec279 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm @@ -0,0 +1,567 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Implement fast SHA-256 with SSSE3 instructions. (x86_64) +; +; Copyright (C) 2013 Intel Corporation. +; +; Authors: +; James Guilford +; Kirk Yap +; Tim Chen +; Transcoded by: +; Xiaodong Liu +; +; This software is available to you under the OpenIB.org BSD license +; below: +; +; Redistribution and use in source and binary forms, with or +; without modification, are permitted provided that the following +; conditions are met: +; +; - Redistributions of source code must retain the above +; copyright notice, this list of conditions and the following +; disclaimer. 
+; +; - Redistributions in binary form must reproduce the above +; copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials +; provided with the distribution. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +; SOFTWARE. +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +%xdefine X0 xmm4 +%xdefine X1 xmm5 +%xdefine X2 xmm6 +%xdefine X3 xmm7 + +%xdefine XTMP0 xmm0 +%xdefine XTMP1 xmm1 +%xdefine XTMP2 xmm2 +%xdefine XTMP3 xmm3 +%xdefine XTMP4 xmm8 +%xdefine XFER xmm9 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm12 + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 ; rdi or rcx +%define NBLK arg1 ; rsi or rdx +%define IDX r8 ; local variable -- consistent with caller +%define NLANX4 r10 ; consistent with caller, should be r10 + +%define TMGR r9 ; data pointer stored in stack named _TMGR +%define INP r9 ; data pointer stored in stack named _INP +%define SRND r9 ; clobbers INP +%define TMP r9 ; local variable -- assistant to address digest + +%xdefine TBL rbp +%xdefine c ecx +%xdefine d esi +%xdefine e edx +%xdefine a eax +%xdefine b ebx + +%xdefine f edi +%xdefine g r12d +%xdefine h r11d + +%xdefine y0 r13d +%xdefine y1 r14d +%xdefine y2 r15d + + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define _STACK_ALIGN_SIZE 8 ; 0 or 8 depends on pushes +%define _INP_END_SIZE 8 +%define _INP_SIZE 8 +%define _TMGR_SIZE 8 +%define _XFER_SIZE 16 +%define _XMM_SAVE_SIZE 0 +%define _GPR_SAVE_SIZE 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15 + +%define _STACK_ALIGN 0 +%define _INP_END (_STACK_ALIGN + _STACK_ALIGN_SIZE) +%define _INP (_INP_END + _INP_END_SIZE) +%define _TMGR (_INP + _INP_SIZE) +%define _XFER (_TMGR + _TMGR_SIZE) +%define _XMM_SAVE (_XFER + _XFER_SIZE) +%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE) +%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE) + +;; assume buffers not aligned +%define MOVDQ movdqu + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 ;changed + mov %1, %2 ;changed +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 ;changed + pshufb %1, %3 ;changed +%endmacro + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 
X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endmacro + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endmacro + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X3 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + palignr XTMP0, X2, 4 ; XTMP0 = W[-7] + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + palignr XTMP1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2 , [rsp + _XFER] ; y2 = k + w + S1 + CH + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pslld XTMP1, (32-7) ; + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + psrld XTMP2, 7 ; + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + mov y0, e ; y0 = e + mov y1, a ; y1 = a + movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] + ror y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y1, (22-13) ; y1 = a >> (22-13) + pslld XTMP3, (32-18) ; + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + psrld XTMP2, 18 ; + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP1, XTMP3 + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pxor XTMP1, XTMP4 ; XTMP1 = s0 + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + pshufd XTMP2, X3, 11111010B ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (25-11) ; y0 = e >> (25-11) + movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> 
(11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH + pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + pshufd XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {BBAA} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25 + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22 + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 + xor y2, g ; y2 = CH = ((f^g)&e)^g + pxor XTMP2, XTMP3 ; + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH + pxor X0, XTMP2 ; X0 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + rotate_Xs +%endmacro + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + %xdefine offset (%1 * 4 + _XFER) + add y2, [rsp + offset] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, 
b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: all general regs, xmm0-xmm12 +; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack} +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text +mk_global sha256_opt_x1, function, internal +sha256_opt_x1: + endbranch + sub rsp, STACK_SIZE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], rbp +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rdi + mov [rsp + _GPR_SAVE + 8*3], rsi + ; caller has already stored XMM6~10 +%endif + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 + mov [rsp + _GPR_SAVE + 8*8], rdx + + shl NBLK, 6 ; convert to bytes + jz done_hash + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + mov [rsp + _TMGR], MGR + ;; Load input pointers + mov INP, [MGR + _data_ptr + IDX*8] + mov [rsp + _INP], INP + ;; nblk is used to indicate data end + add NBLK, INP + mov [rsp + _INP_END], NBLK ; pointer to end of data + + + mov TMGR, [rsp + _TMGR] + ;; load initial digest + lea TMP, [TMGR + 4*IDX] + mov a, [TMP + 0*NLANX4] + mov b, [TMP + 1*NLANX4] + mov c, [TMP + 2*NLANX4] + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + mov d, [TMP + 1*NLANX4] + mov e, [TMP + 2*NLANX4] + mov g, [TMP + 4*NLANX4] + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 + mov f, [TMP + 2*NLANX4] + mov h, [TMP + 4*NLANX4] + + movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK] + movdqa SHUF_00BA, [_SHUF_00BA] + movdqa SHUF_DC00, [_SHUF_DC00] + + mov INP, [rsp + _INP] +loop0: + lea TBL, [K256] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 + +loop1: + movdqa XFER, [TBL] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + paddd X0, [TBL] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne loop2 + + ; write out digests + mov TMGR, [rsp + _TMGR] + lea TMP, [TMGR + 4*IDX] + addm a, [TMP + 0*NLANX4] + addm b, [TMP + 1*NLANX4] + addm c, [TMP + 2*NLANX4] + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + addm d, [TMP + 
1*NLANX4] + addm e, [TMP + 2*NLANX4] + addm g, [TMP + 4*NLANX4] + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 + addm f, [TMP + 2*NLANX4] + addm h, [TMP + 4*NLANX4] + + mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne loop0 + +done_hash: + mov MGR, [rsp + _TMGR] + + mov rdx, [rsp + _GPR_SAVE + 8*8] + mov r15, [rsp + _GPR_SAVE + 8*7] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r12, [rsp + _GPR_SAVE + 8*4] +%ifidn __OUTPUT_FORMAT__, win64 + mov rsi, [rsp + _GPR_SAVE + 8*3] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbp, [rsp + _GPR_SAVE + 8*1] + mov rbx, [rsp + _GPR_SAVE + 8*0] + add rsp, STACK_SIZE + + ret + +section .data +align 64 +K256: + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + DQ 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: + DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF + +; shuffle xDxC -> DC00 +_SHUF_DC00: + DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c new file mode 100644 index 000000000..c3515dc52 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c @@ -0,0 +1,204 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sha256_mb.h" +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA256 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define H0 0x6a09e667 +#define H1 0xbb67ae85 +#define H2 0x3c6ef372 +#define H3 0xa54ff53a +#define H4 0x510e527f +#define H5 0x9b05688c +#define H6 0x1f83d9ab +#define H7 0x5be0cd19 + +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define W(x) w[(x) & 15] + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]); + +void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + digest[5] = H5; + digest[6] = H6; + digest[7] = H7; + + i = len; + while (i >= SHA256_BLOCK_SIZE) { + sha256_single(input_data, digest); + input_data += SHA256_BLOCK_SIZE; + i -= SHA256_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha256_single(buf, digest); + if (i == 2 * SHA256_BLOCK_SIZE) + sha256_single(buf + SHA256_BLOCK_SIZE, digest); +} + +void sha256_single(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 
0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am new file mode 100644 index 000000000..4ba7d1049 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am @@ -0,0 +1,108 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_x86_64 += sha512_mb/sha512_ctx_sse.c \ + sha512_mb/sha512_ctx_avx.c \ + sha512_mb/sha512_ctx_avx2.c \ + sha512_mb/sha512_ctx_sb_sse4.c \ + sha512_mb/sha512_ctx_base.c + +lsrc_x86_64 += sha512_mb/sha512_mb_mgr_init_sse.c \ + sha512_mb/sha512_mb_mgr_init_avx2.c \ + sha512_mb/sha512_sb_mgr_init_sse4.c + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_x86_64 += sha512_mb/sha512_mb_mgr_submit_sse.asm \ + sha512_mb/sha512_mb_mgr_submit_avx.asm \ + sha512_mb/sha512_mb_mgr_submit_avx2.asm \ + sha512_mb/sha512_mb_mgr_flush_sse.asm \ + sha512_mb/sha512_mb_mgr_flush_avx.asm \ + sha512_mb/sha512_mb_mgr_flush_avx2.asm \ + sha512_mb/sha512_mb_x2_sse.asm \ + sha512_mb/sha512_mb_x2_avx.asm \ + sha512_mb/sha512_mb_x4_avx2.asm \ + sha512_mb/sha512_multibinary.asm \ + sha512_mb/sha512_sb_mgr_submit_sse4.c \ + sha512_mb/sha512_sb_mgr_flush_sse4.c \ + sha512_mb/sha512_sse4.asm + +lsrc_x86_64 += sha512_mb/sha512_ctx_avx512.c \ + sha512_mb/sha512_mb_mgr_init_avx512.c \ + sha512_mb/sha512_mb_mgr_submit_avx512.asm \ + sha512_mb/sha512_mb_mgr_flush_avx512.asm \ + sha512_mb/sha512_mb_x8_avx512.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += sha512_mb/sha512_ctx_base.c \ + sha512_mb/aarch64/sha512_mb_multibinary.S \ + sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c \ + sha512_mb/aarch64/sha512_ctx_ce.c \ + sha512_mb/aarch64/sha512_mb_mgr_ce.c \ + sha512_mb/aarch64/sha512_mb_x1_ce.S \ + sha512_mb/aarch64/sha512_mb_x2_ce.S + +lsrc_base_aliases += sha512_mb/sha512_ctx_base.c \ + sha512_mb/sha512_ctx_base_aliases.c + +src_include += -I $(srcdir)/sha512_mb + +extern_hdrs += include/sha512_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + sha512_mb/sha512_job.asm \ + sha512_mb/sha512_mb_mgr_datastruct.asm \ + include/reg_sizes.asm \ + sha512_mb/sha512_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha512_mb/sha512_mb_test \ + sha512_mb/sha512_mb_rand_test \ + sha512_mb/sha512_mb_rand_update_test + +unit_tests += sha512_mb/sha512_mb_rand_ssl_test + +perf_tests += sha512_mb/sha512_mb_vs_ossl_perf + +sha512_mb_rand_test: sha512_ref.o +sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la + +sha512_mb_rand_update_test: sha512_ref.o 
+sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la + +sha512_mb_rand_ssl_test: LDLIBS += -lcrypto +sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c new file mode 100644 index 000000000..02f04197b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c @@ -0,0 +1,256 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state); +SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job); +SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_ce(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_ce(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_ce(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_ce(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_ce(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_ce_slver_02020142; +struct slver sha512_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_submit_ce_slver_02020143; +struct slver sha512_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_flush_ce_slver_02020144; +struct slver sha512_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..321e8507d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c @@ -0,0 +1,59 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA3) + return PROVIDER_INFO(sha512_ctx_mgr_submit_ce); + + return PROVIDER_BASIC(sha512_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA3) + return PROVIDER_INFO(sha512_ctx_mgr_init_ce); + + return PROVIDER_BASIC(sha512_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA3) + return PROVIDER_INFO(sha512_ctx_mgr_flush_ce); + + return PROVIDER_BASIC(sha512_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c new file mode 100644 index 000000000..43801c3d6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c @@ -0,0 +1,210 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif +#ifndef SHA512_MB_CE_MAX_LANES +#define SHA512_MB_CE_MAX_LANES 2 +#endif + +#if SHA512_MB_CE_MAX_LANES >=2 +void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int); +#endif +void sha512_mb_ce_x1(SHA512_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state) +{ + int i; + //~ state->unused_lanes = 0xf3210; + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = SHA512_MB_CE_MAX_LANES - 1; i >= 0; i--) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SHA1_MB_CE_MAX_LANES is invalid lane + for (i = SHA512_MB_CE_MAX_LANES; i < SHA512_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha512_mb_mgr_do_jobs(SHA512_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SHA512_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SHA512_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha512_mb_ce_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SHA512_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); + +#if SHA512_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sha512_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sha512_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < SHA512_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 3; + } + } + + return lane_idx; + +} + +static SHA512_JOB *sha512_mb_mgr_free_lane(SHA512_MB_JOB_MGR * state) +{ + int i; + SHA512_JOB *ret = NULL; + + for (i = 0; i < SHA512_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + 
state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha512_mb_mgr_insert_job(SHA512_MB_JOB_MGR * state, SHA512_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SHA512_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA512_JOB *ret; + + //add job into lanes + sha512_mb_mgr_insert_job(state, job); + + ret = sha512_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SHA512_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha512_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha512_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sha512_mb_mgr_free_lane(state); + return ret; +} + +SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state) +{ + SHA512_JOB *ret; + ret = sha512_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha512_mb_mgr_do_jobs(state); + return sha512_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S new file mode 100644 index 000000000..58bf13478 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + +#include + + +mbin_interface sha512_ctx_mgr_submit +mbin_interface sha512_ctx_mgr_init +mbin_interface sha512_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S new file mode 100644 index 000000000..ab5d0aed7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8.2-a+crypto+sha3 + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 0-63 +*/ +.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7 + ldr key_q , [key_adr] + add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d + add key_adr,key_adr,16 + ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8 + ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8 + ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8 + add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d + ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8 + sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d + sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d + sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d + add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d + sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d +.endm +/** +maros for round 64-79 +*/ +.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0 + ldr key_q , [key_adr] + add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d + add key_adr,key_adr,16 + ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8 + ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8 + ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8 + add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d + sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d + add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d + sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,31 + + +/* +digest variables +*/ + declare_var_vector_reg l0_ab,0 + declare_var_vector_reg l0_cd,1 + declare_var_vector_reg l0_ef,2 + declare_var_vector_reg l0_gh,3 + + declare_var_vector_reg l0_tmp,4 + declare_var_vector_reg l0_ab_saved,24 + declare_var_vector_reg l0_cd_saved,25 + declare_var_vector_reg l0_ef_saved,26 + declare_var_vector_reg l0_gh_saved,27 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,5 + declare_var_vector_reg l0_tmp1,6 + declare_var_vector_reg l0_tmp2,7 + +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + declare_var_vector_reg l0_msg4,20 + declare_var_vector_reg l0_msg5,21 + declare_var_vector_reg l0_msg6,22 + declare_var_vector_reg l0_msg7,23 + + + +/* + void sha512_mb_ce_x1(SHA1_JOB * l0_job, int len); +*/ +/* +Arguements list +*/ + l0_job .req x0 + len .req w1 + l0_data .req x2 + key_adr .req x3 + .global sha512_mb_ce_x1 + .type sha512_mb_ce_x1, %function +sha512_mb_ce_x1: + ldr l0_data, [l0_job] + // load initial digest + add x4,l0_job,64 + ld1 {l0_ab_v.4s-l0_gh_v.4s},[x4] + + + +start_loop: + adr key_adr, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + add l0_data,l0_data,64 + ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data] + add l0_data,l0_data,64 + //adjust loop parameter + + sub len, len, #1 + cmp len, 0 + + //save state + mov l0_ab_saved_v.16b,l0_ab_v.16b + mov l0_cd_saved_v.16b,l0_cd_v.16b + mov l0_ef_saved_v.16b,l0_ef_v.16b + mov l0_gh_saved_v.16b,l0_gh_v.16b + + //rev endian + rev64 l0_msg0_v.16b,l0_msg0_v.16b + rev64 l0_msg1_v.16b,l0_msg1_v.16b + rev64 l0_msg2_v.16b,l0_msg2_v.16b + rev64 l0_msg3_v.16b,l0_msg3_v.16b + rev64 l0_msg4_v.16b,l0_msg4_v.16b + rev64 l0_msg5_v.16b,l0_msg5_v.16b + rev64 l0_msg6_v.16b,l0_msg6_v.16b + rev64 l0_msg7_v.16b,l0_msg7_v.16b + + + + sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 
*/ + sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 16-17 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */ + sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */ + sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */ + sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */ + sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */ + sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */ + sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */ + sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */ + sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */ + + + + add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d + add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d + add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d + add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d + + + bgt start_loop + + add x4,l0_job,64 + st1 {l0_ab_v.4s-l0_gh_v.4s},[x4] + + + ret + + .size sha512_mb_ce_x1, .-sha512_mb_ce_x1 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 
0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S new file mode 100644 index 000000000..7864eb028 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S @@ -0,0 +1,390 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+crypto+sha3 + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm +/** +maros for round 0-63 +*/ +.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7 + ldr key_q , [key_adr] + add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d + add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d + add key_adr,key_adr,16 + + + ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8 + ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8 + + + ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8 + ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8 + + + ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8 + ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8 + + + add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d + add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d + + + ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8 + ext l1_tmp0_v.16b,l1_\msg4\()_v.16b,l1_\msg5\()_v.16b,#8 + + sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d + sha512su0 l1_\msg0\()_v.2d,l1_\msg1\()_v.2d + + sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d + sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d + + sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d + sha512su1 l1_\msg0\()_v.2d,l1_\msg7\()_v.2d,l1_tmp0_v.2d + + add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d + add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d + + sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d + sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d +.endm + +/** +maros for round 64-79 +*/ +.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0 + ldr key_q , [key_adr] + add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d + add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d + add key_adr,key_adr,16 + + + ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8 + ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8 + + + ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8 + ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8 + + + ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8 + ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8 + + + add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d + add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d + + + + sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d + sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d + + + add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d + add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d + + sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d + sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key,6 + + +/* +digest variables +*/ + declare_var_vector_reg l0_ab,0 + declare_var_vector_reg l0_cd,1 + declare_var_vector_reg l0_ef,2 + declare_var_vector_reg l0_gh,3 + declare_var_vector_reg l0_tmp,4 + + declare_var_vector_reg l1_ab,8 + declare_var_vector_reg l1_cd,9 + declare_var_vector_reg l1_ef,10 + declare_var_vector_reg l1_gh,11 + declare_var_vector_reg l1_tmp,12 + + + declare_var_vector_reg 
l0_ab_saved,16 + declare_var_vector_reg l0_cd_saved,17 + declare_var_vector_reg l0_ef_saved,18 + declare_var_vector_reg l0_gh_saved,19 + declare_var_vector_reg l1_ab_saved,24 + declare_var_vector_reg l1_cd_saved,25 + declare_var_vector_reg l1_ef_saved,26 + declare_var_vector_reg l1_gh_saved,27 +/* +Temporay variables +*/ + declare_var_vector_reg l0_tmp0,5 + declare_var_vector_reg l0_tmp1,6 + declare_var_vector_reg l0_tmp2,7 + + declare_var_vector_reg l1_tmp0,13 + declare_var_vector_reg l1_tmp1,14 + declare_var_vector_reg l1_tmp2,15 + + + +/* +Message variables +*/ + declare_var_vector_reg l0_msg0,16 + declare_var_vector_reg l0_msg1,17 + declare_var_vector_reg l0_msg2,18 + declare_var_vector_reg l0_msg3,19 + declare_var_vector_reg l0_msg4,20 + declare_var_vector_reg l0_msg5,21 + declare_var_vector_reg l0_msg6,22 + declare_var_vector_reg l0_msg7,23 + + declare_var_vector_reg l1_msg0,24 + declare_var_vector_reg l1_msg1,25 + declare_var_vector_reg l1_msg2,26 + declare_var_vector_reg l1_msg3,27 + declare_var_vector_reg l1_msg4,28 + declare_var_vector_reg l1_msg5,29 + declare_var_vector_reg l1_msg6,30 + declare_var_vector_reg l1_msg7,31 + + + +/* + void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int); +*/ +/* +Arguements list +*/ + l0_job .req x0 + l1_job .req x1 + len .req w2 + l0_data .req x3 + l1_data .req x4 + key_adr .req x5 + l0_digest_adr .req x6 + l1_digest_adr .req x7 + .global sha512_mb_ce_x2 + .type sha512_mb_ce_x2, %function +sha512_mb_ce_x2: + //push d8~d15 + stp d8,d9,[sp,-192]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + ldr l0_data, [l0_job] + ldr l1_data, [l1_job] + // load initial digest + add l0_digest_adr,l0_job,64 + add l1_digest_adr,l1_job,64 + ld1 {l0_ab_v.4s-l0_gh_v.4s},[l0_digest_adr] + ld1 {l1_ab_v.4s-l1_gh_v.4s},[l1_digest_adr] + + + +start_loop: + + adr key_adr, KEY + //load msgs + ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data] + add l0_data,l0_data,64 + ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data] + add l0_data,l0_data,64 + + ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data] + add l1_data,l1_data,64 + ld1 {l1_msg4_v.4s-l1_msg7_v.4s},[l1_data] + add l1_data,l1_data,64 + + //adjust loop parameter + sub len, len, #1 + cmp len, 0 + + + + //rev endian + rev64 l0_msg0_v.16b,l0_msg0_v.16b + rev64 l0_msg1_v.16b,l0_msg1_v.16b + rev64 l0_msg2_v.16b,l0_msg2_v.16b + rev64 l0_msg3_v.16b,l0_msg3_v.16b + rev64 l0_msg4_v.16b,l0_msg4_v.16b + rev64 l0_msg5_v.16b,l0_msg5_v.16b + rev64 l0_msg6_v.16b,l0_msg6_v.16b + rev64 l0_msg7_v.16b,l0_msg7_v.16b + + rev64 l1_msg0_v.16b,l1_msg0_v.16b + rev64 l1_msg1_v.16b,l1_msg1_v.16b + rev64 l1_msg2_v.16b,l1_msg2_v.16b + rev64 l1_msg3_v.16b,l1_msg3_v.16b + rev64 l1_msg4_v.16b,l1_msg4_v.16b + rev64 l1_msg5_v.16b,l1_msg5_v.16b + rev64 l1_msg6_v.16b,l1_msg6_v.16b + rev64 l1_msg7_v.16b,l1_msg7_v.16b + + + + + + + + + + sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 
16-17 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */ + sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */ + sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */ + sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */ + sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */ + sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */ + sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */ + sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */ + sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */ + sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */ + ld1 {l0_ab_saved_v.4s-l0_gh_saved_v.4s},[l0_digest_adr] + ld1 {l1_ab_saved_v.4s-l1_gh_saved_v.4s},[l1_digest_adr] + sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */ + sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */ + sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */ + sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */ + + + + add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d + add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d + add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d + add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d + st1 {l0_ab_v.2d-l0_gh_v.2d},[l0_digest_adr] + + add l1_ab_v.2d,l1_ab_v.2d,l1_ab_saved_v.2d + add l1_cd_v.2d,l1_cd_v.2d,l1_cd_saved_v.2d + add l1_ef_v.2d,l1_ef_v.2d,l1_ef_saved_v.2d + add l1_gh_v.2d,l1_gh_v.2d,l1_gh_saved_v.2d + st1 {l1_ab_v.2d-l1_gh_v.2d},[l1_digest_adr] + + + + + bgt start_loop + + add x4,l0_job,64 + + + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + + ret + + .size sha512_mb_ce_x2, .-sha512_mb_ce_x2 + .section .rol0_data.cst16,"aM",@progbits,16 + .align 4 +KEY: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + 
.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c new file mode 100644 index 000000000..24d96763a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
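+	// (job.len counts whole SHA512_BLOCK_SIZE-byte blocks, so the filled extra
+	// block is submitted as a one-block job. The multi-buffer manager may hand
+	// back a different, already completed context here, or NULL if no lane has
+	// finished yet; either way the result is passed through
+	// sha512_ctx_mgr_resubmit() below.)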
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
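+		// For the final submission, hash_pad() (defined below) appends the 0x80
+		// byte, zero fill and the big-endian message length in bits to the
+		// partial block buffer and returns how many extra blocks (1 or 2) that
+		// produced. Both PROCESSING and COMPLETE are set so that, once the
+		// manager hands this job back, the check at the top of this loop
+		// returns it to the caller.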
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx_slver_02020166; +struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_submit_avx_slver_02020167; +struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_flush_avx_slver_02020168; +struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c new file mode 100644 index 000000000..9923e2097 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
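+	// The bitwise '|' below is intentional: the branch is taken when data is
+	// already buffered or when this submission is shorter than one 128-byte
+	// block. For example, a 50-byte HASH_UPDATE with nothing buffered yet is
+	// just copied and parked here, while a 200-byte HASH_UPDATE with nothing
+	// buffered skips ahead to sha512_ctx_mgr_resubmit(), which hashes one full
+	// block and buffers the remaining 72 bytes.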
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
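+		// In the latter case (no HASH_LAST yet), the job is parked as IDLE
+		// below and returned to the caller; it resumes on the next submit.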
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx2_slver_04020169; +struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 }; + +struct slver sha512_ctx_mgr_submit_avx2_slver_04020170; +struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 }; + +struct slver sha512_ctx_mgr_flush_avx2_slver_04020171; +struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c new file mode 100644 index 000000000..5c0757716 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c @@ -0,0 +1,274 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
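+	// copy_len is capped below so that at most one full block ever accumulates
+	// at this point (see the assert); the second SHA512_BLOCK_SIZE bytes of
+	// partial_block_buffer are only written later, by hash_pad(), when the
+	// final padding block(s) are generated.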
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
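+		// A NULL coming back from the manager in the branches above simply
+		// means this job is still being processed in one of the manager's
+		// lanes; this helper then returns NULL as well and the caller picks
+		// the job up later through sha512_ctx_mgr_flush_avx512().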
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx512_slver_0600016a; +struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 }; + +struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b; +struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 }; + +struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c; +struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c new file mode 100644 index 000000000..61a8fa000 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c @@ -0,0 +1,323 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words + * instead of 32 bit. + */ +#define ch(e,f,g) ((e & f) ^ (g & ~e)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) + +/* Sigma functions have same form as SHA256 but + * - change the word size to 64bit + * - change the amount to rotate + */ +#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r)))) + +/* Technically, s0 should be S0 as these are "capital sigma" functions, and likewise the case + * of the S0 should be s0, but keep as-is to avoid confusion with the other reference functions. + */ +#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39)) +#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41)) + +#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7)) +#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6)) + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be64(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len); +static void sha512_single(const void *data, uint64_t digest[]); +static inline void hash_init_digest(SHA512_WORD_T * digest); + +void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr) +{ +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + + sha512_init(ctx, buffer, len); + sha512_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + sha512_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sha512_update(ctx, buffer, len); + sha512_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sha512_init(ctx, buffer, len); + remain_len = sha512_update(ctx, buffer, len); + sha512_final(ctx, remain_len); + } + + return ctx; +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint64_t *digest = ctx->job.result_digest; + + while (remain_len >= SHA512_BLOCK_SIZE) { + sha512_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SHA512_BLOCK_SIZE); + remain_len -= SHA512_BLOCK_SIZE; + ctx->total_length += SHA512_BLOCK_SIZE; + } + ctx->status = HASH_CTX_STS_IDLE; + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len, j; + uint8_t buf[2 * SHA512_BLOCK_SIZE]; + uint64_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy(buf, buffer, i); + buf[i++] = 0x80; + for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++) + buf[j] = 0; + + if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE) + i = 2 * SHA512_BLOCK_SIZE; + else + i = SHA512_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sha512_single(buf, digest); + if (i == 2 * SHA512_BLOCK_SIZE) { + sha512_single(buf + SHA512_BLOCK_SIZE, digest); + } + + ctx->status = HASH_CTX_STS_COMPLETE; +} + +void sha512_single(const void *data, uint64_t digest[]) +{ + /* Check these are all uint64_t */ + uint64_t a, b, c, d, e, f, g, h, t1, t2; + uint64_t w[16]; + uint64_t *ww = (uint64_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22); + step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc); + step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242); + step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe); + step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694); + step(16, a, b, c, d, e, f, g, h, 
0xe49b69c19ef14ad2); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4); + step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5); + step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210); + step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f); + step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70); + step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc); + step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed); + step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df); + step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6); + step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364); + step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30); + step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218); + step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910); + step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a); + step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8); + step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53); + step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60); + step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72); + step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec); + step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28); + step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63 + step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c); + step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207); + step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e); + step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178); + step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba); + step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6); + step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae); + step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b); + step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84); + step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493); + step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc); + step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c); + step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6); + step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a); + step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec); + step(79, b, c, d, e, f, g, h, a, 
0x6c44198c4a475817); // step 79 + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_base_slver_000002f3; +struct slver sha512_ctx_mgr_init_base_slver = { 0x02f3, 0x00, 0x00 }; + +struct slver sha512_ctx_mgr_submit_base_slver_000002f4; +struct slver sha512_ctx_mgr_submit_base_slver = { 0x02f4, 0x00, 0x00 }; + +struct slver sha512_ctx_mgr_flush_base_slver_000002f5; +struct slver sha512_ctx_mgr_flush_base_slver = { 0x02f5, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c new file mode 100644 index 000000000..9890c2c47 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include +#include +#include "sha512_mb.h" +#include "memcpy_inline.h" + +extern void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr); +extern SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx, const void *buffer, + uint32_t len, HASH_CTX_FLAG flags); +extern SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr); + +void sha512_ctx_mgr_init(SHA512_HASH_CTX_MGR * mgr) +{ + return sha512_ctx_mgr_init_base(mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sha512_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush(SHA512_HASH_CTX_MGR * mgr) +{ + return sha512_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c new file mode 100644 index 000000000..94c32d260 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c @@ -0,0 +1,255 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
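This aliases file simply forwards sha512_ctx_mgr_init()/submit()/flush() to the portable _base implementation on builds without a multibinary dispatcher; the calling pattern is identical for every arch-specific variant in this directory. A minimal usage sketch of that pattern follows. It assumes the public declarations from sha512_mb.h plus the hash_ctx_init() initializer and the HASH_ENTIRE (FIRST and LAST in one call) flag used elsewhere in this library, and it trims error handling.

#include <stdio.h>
#include <stdlib.h>
#include "sha512_mb.h"

int main(void)
{
	SHA512_HASH_CTX_MGR *mgr = NULL;
	SHA512_HASH_CTX ctx;
	const char msg[] = "abc";

	/* The manager holds per-lane SIMD state; allocate it aligned
	 * (64 is safely above what the SSE/AVX variants need). */
	if (posix_memalign((void **)&mgr, 64, sizeof(*mgr)))
		return 1;

	sha512_ctx_mgr_init(mgr);
	hash_ctx_init(&ctx);	/* assumed helper from multi_buffer.h: clears status/error */

	/* One-shot job: HASH_ENTIRE submits the whole buffer as FIRST|LAST. */
	sha512_ctx_mgr_submit(mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);

	/* Drain the manager; flush returns completed jobs until none remain. */
	while (sha512_ctx_mgr_flush(mgr) != NULL)
		;

	for (int i = 0; i < SHA512_DIGEST_NWORDS; i++)
		printf("%016llx", (unsigned long long)ctx.job.result_digest[i]);
	printf("\n");

	free(mgr);
	return 0;
}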
+**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_sb_mgr_init_sse4(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx, const void *buffer, + uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. 
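The submit path above never hands the lane manager anything smaller than a whole 128-byte block: a short tail is copied into ctx->partial_block_buffer and is only submitted once later calls top it up to a full block. A tiny self-contained sketch of that buffering rule, with hypothetical names, purely for reference:

#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 128u		/* SHA-512 block size in bytes */

/* Accumulate bytes into a partial-block buffer; returns 1 when the buffer
 * holds exactly one full block and is ready to be handed to the manager. */
static int buffer_partial(uint8_t buf[BLOCK_SIZE], uint32_t *fill,
			  const uint8_t **data, uint32_t *len)
{
	uint32_t copy = BLOCK_SIZE - *fill;
	if (copy > *len)
		copy = *len;

	memcpy(buf + *fill, *data, copy);
	*fill += copy;
	*data += copy;
	*len  -= copy;

	return *fill == BLOCK_SIZE;
}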
+ if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
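The remainder arithmetic above works because SHA512_BLOCK_SIZE is a power of two (128): len & (SHA512_BLOCK_SIZE - 1) isolates the tail that cannot be hashed yet, and len >>= SHA512_LOG2_BLOCK_SIZE turns the remaining byte count into the whole-block count handed to the lane manager. The same split, as a stand-alone sketch with hypothetical names:

#include <assert.h>
#include <stdint.h>

#define BLOCK_SIZE       128u	/* SHA-512 block size in bytes */
#define LOG2_BLOCK_SIZE  7u

/* Split a byte count into whole blocks plus a remainder, the way the
 * resubmit path above does before submitting blocks to the lane manager. */
static void split_blocks(uint32_t len, uint32_t *nblocks, uint32_t *remainder)
{
	*remainder = len & (BLOCK_SIZE - 1);	/* bytes that must stay buffered */
	len -= *remainder;
	assert((len % BLOCK_SIZE) == 0);
	*nblocks = len >> LOG2_BLOCK_SIZE;	/* whole blocks ready to hash */
}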
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172; +struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 }; + +struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173; +struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 }; + +struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174; +struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c new file mode 100644 index 000000000..b73619875 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c @@ -0,0 +1,255 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
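hash_pad() above implements the FIPS 180-4 padding rule: append a 0x80 byte, zero-fill, and finish with a 128-bit big-endian length-in-bits field (only the low 64 bits are populated here); depending on how many bytes the final partial block already holds, that takes one or two extra 128-byte blocks. A hedged equivalent of just the block-count result, handy for checking the pointer arithmetic above:

#include <stdint.h>

/* Number of extra 128-byte blocks SHA-512 padding adds for a message of
 * total_len bytes.  Matches the return value of hash_pad(): one block
 * normally, two when the 0x80 byte plus the 16-byte length field no longer
 * fit behind the buffered tail. */
static uint32_t sha512_pad_blocks(uint64_t total_len)
{
	uint32_t tail = (uint32_t)(total_len % 128);	/* bytes already in the last block */
	return (tail + 1 + 16 <= 128) ? 1 : 2;
}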
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_sse(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_sse_slver_00020163; +struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 }; + +struct slver sha512_ctx_mgr_submit_sse_slver_00020164; +struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 }; + +struct slver sha512_ctx_mgr_flush_sse_slver_00020165; +struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm new file mode 100644 index 000000000..4423cdcb5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm @@ -0,0 +1,54 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA512_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; JOB_SHA512 + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 8, 8 ; length in bytes +FIELD _result_digest, 8*8, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 + +%assign _SHA512_JOB_size _FIELD_OFFSET +%assign _SHA512_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm new file mode 100644 index 000000000..f54135da3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm @@ -0,0 +1,72 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
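The FIELD macros above describe the SHA512_JOB layout that the assembly managers index by fixed byte offsets, so it must stay in lockstep with the C-side typedef in sha512_mb.h. A hedged C mirror of that layout (names shortened; the authoritative definition lives in the header, and the aligned() attribute below is GCC/Clang syntax standing in for the header's own macro):

#include <stdint.h>

/* C view of the JOB_SHA512 layout declared above with datastruct.asm:
 * any change here or there must be made in both places, since the
 * managers address these fields by fixed byte offsets. */
typedef struct {
	uint8_t  *buffer;	/* _buffer: data to hash                            */
	uint64_t  len;		/* _len: job length (the ctx layer passes blocks)   */
	uint64_t  result_digest[8] __attribute__((aligned(64)));	/* _result_digest */
	uint32_t  status;	/* _status: STS_UNKNOWN / BEING_PROCESSED / COMPLETED */
	void     *user_data;	/* _user_data: opaque caller field                  */
} SHA512_JOB_SKETCH;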
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA512 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA512_ARGS_X8 +;;; name size align +FIELD _digest, 8*8*8, 4 ; transposed digest +FIELD _data_ptr, 8*8, 8 ; array of pointers to data +END_FIELDS + +%assign _SHA512_ARGS_X4_size _FIELD_OFFSET +%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN +%assign _SHA512_ARGS_X8_size _FIELD_OFFSET +%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align +FIELD _lens, 8*8, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..65ce43d3a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm @@ -0,0 +1,224 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
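The _digest field above is commented as "transposed": rather than keeping each lane's eight digest words contiguous, the manager stores word w of every lane side by side so the SIMD cores can load one register with word w across all lanes. That is why the flush routines gather a finished job's digest with a stride (32 bytes per word in the SSE/AVX/AVX2 managers, 64 in the AVX-512 one) instead of a flat copy. A hedged C sketch of the layout and of the scatter/gather that submit and flush perform:

#include <stdint.h>

#define NUM_LANES 4	/* 4-lane spacing as used by the SSE/AVX/AVX2 managers; 8 for AVX-512 */

/* Transposed digest storage: digest[word][lane].  Word w of lane l then sits
 * at byte offset 8*l + 8*NUM_LANES*w, matching the strides in the asm. */
typedef struct {
	uint64_t digest[8][NUM_LANES];
} sha512_args_sketch;

/* submit side: scatter one job's chaining value into its lane's column. */
static void lane_scatter(sha512_args_sketch *args, int lane, const uint64_t job_digest[8])
{
	for (int w = 0; w < 8; w++)
		args->digest[w][lane] = job_digest[w];
}

/* flush side: gather the finished lane's column back into the job. */
static void lane_gather(const sha512_args_sketch *args, int lane, uint64_t job_digest[8])
{
	for (int w = 0; w < 8; w++)
		job_digest[w] = args->digest[w][lane];
}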
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_avx + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha512_mb_mgr_flush_avx, function +sha512_mb_mgr_flush_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] 
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..33a24a6b9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm @@ -0,0 +1,245 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
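sha512_mb_mgr_flush_avx() above (and its SSE/AVX2/AVX-512 siblings that follow) all use the same recipe: pick any lane that still holds a job, duplicate its data pointer into every idle lane with the lane length forced to the 0xFFFFFFFF sentinel so an idle lane can never win the minimum, find the lane with the fewest remaining blocks, run the multi-lane core for exactly that many blocks, and retire that lane's job. A hedged scalar C sketch of the same control flow; core() stands in for sha512_mb_x2_avx and friends:

#include <stddef.h>
#include <stdint.h>

#define NUM_LANES 2	/* 2 lanes in the AVX manager sketched here */

typedef struct { void *job; const uint8_t *data; uint32_t blocks; } lane_t;

/* Stand-in for sha512_mb_x2_avx(): the real core hashes `blocks` 128-byte
 * blocks in every lane; here we only advance the data pointers. */
static void core(lane_t lanes[NUM_LANES], uint32_t blocks)
{
	for (int i = 0; i < NUM_LANES; i++)
		lanes[i].data += (size_t)blocks * 128;
}

/* Scalar sketch of the flush logic: returns the job that finishes first,
 * or NULL when no lane holds a job. */
static void *mgr_flush_sketch(lane_t lanes[NUM_LANES])
{
	int live = -1;
	for (int i = 0; i < NUM_LANES; i++)
		if (lanes[i].job) { live = i; break; }
	if (live < 0)
		return NULL;

	/* Idle lanes borrow a live lane's buffer and get a huge length, so the
	 * SIMD core sees valid pointers and they never become the minimum. */
	uint32_t min_blocks = UINT32_MAX;
	int min_lane = live;
	for (int i = 0; i < NUM_LANES; i++) {
		if (!lanes[i].job) {
			lanes[i].data = lanes[live].data;
			lanes[i].blocks = UINT32_MAX;
		} else if (lanes[i].blocks < min_blocks) {
			min_blocks = lanes[i].blocks;
			min_lane = i;
		}
	}

	/* Advance every lane by the shortest remaining length ... */
	core(lanes, min_blocks);
	for (int i = 0; i < NUM_LANES; i++)
		if (lanes[i].job)
			lanes[i].blocks -= min_blocks;

	/* ... after which the shortest job is complete and can be retired. */
	void *done = lanes[min_lane].job;
	lanes[min_lane].job = NULL;
	return done;
}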
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x4_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.xmm +%define _GPR_SAVE stack_frame.gpr +%define STACK_SPACE stack_frame_size + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha512_mb_mgr_flush_avx2, function +sha512_mb_mgr_flush_avx2: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + mov lens2, [state + _lens + 2*8] + cmp lens2, idx + cmovb idx, lens2 + mov lens3, [state + _lens + 3*8] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + mov [state + _lens + 2*8], lens2 + mov [state + _lens + 3*8], lens3 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x4_avx2 + ; state and idx 
are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..795027c6b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm @@ -0,0 +1,270 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
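Each 64-bit _lens entry in these managers does double duty: the low bits hold the lane's own index (written once at init time) and the high 32 bits hold the number of blocks left in that lane. That is why the minimum search can compare whole 64-bit values and afterwards recover the winning lane with "and idx, 0xF" and the block count with "shr len2, 32". A hedged sketch of the packing helpers:

#include <stdint.h>

/* _lens entry layout used by the flush/submit managers:
 *   bits 63..32  remaining length in 128-byte blocks
 *   bits  3..0   index of the lane this entry belongs to */
static inline uint64_t lens_pack(uint32_t blocks, uint32_t lane_idx)
{
	return ((uint64_t)blocks << 32) | (lane_idx & 0xF);
}

static inline uint32_t lens_blocks(uint64_t lens) { return (uint32_t)(lens >> 32); }
static inline uint32_t lens_lane(uint64_t lens)   { return (uint32_t)(lens & 0xF); }

/* Because the block count sits in the high bits, taking the numeric minimum
 * of the packed values picks the lane with the fewest blocks left, and the
 * lane index rides along for free, exactly what the cmov and vpminuq chains do. */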
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern sha512_mb_x8_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define num_lanes_inuse r9 +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*8 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.xmm +%define _GPR_SAVE stack_frame.gpr +%define STACK_SPACE stack_frame_size + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha512_mb_mgr_flush_avx512, function +sha512_mb_mgr_flush_avx512: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 7 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find 
min length, len in sha512_mgr is 64bit, high 32bit is block num, low 8bit is idx + vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a} + vmovdqu ymm1, [state + _lens + 1*32] + + vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword + + vmovq idx, xmm2 + mov len2, idx + and idx, 0xF + shr len2, 32 ; SHA512 blocksize is 1024bit + jz len_is_0 + + vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i} + vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0} + vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0} + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x8_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovq xmm0, [state + _args_digest + 8*idx + 0*64] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*64] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*64] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*64] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index + dq 0xFFFFFFFF00000000, 0x0000000000000000 + dq 0xFFFFFFFF00000000, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_mgr_flush_avx512 +no_sha512_mb_mgr_flush_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git 
a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm new file mode 100644 index 000000000..8a58bf879 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm @@ -0,0 +1,227 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
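The AVX-512 flush above replaces the scalar cmov chain with a logarithmic reduction: vpminuq, vpalignr and vperm2i128 repeatedly fold the eight packed _lens values in half until a single minimum survives, and that packed value then yields both the lane index and the block count (see the packing note earlier). A scalar C sketch of the same pairwise reduction:

#include <stdint.h>

/* Pairwise min-reduction over the 8 packed lens values, following the same
 * 8 -> 4 -> 2 -> 1 folding idea as sha512_mb_mgr_flush_avx512 above. */
static uint64_t min_lens8(const uint64_t lens[8])
{
	uint64_t v[8];
	for (int i = 0; i < 8; i++)
		v[i] = lens[i];

	for (int stride = 4; stride >= 1; stride /= 2)
		for (int i = 0; i < stride; i++)
			v[i] = v[i] < v[i + stride] ? v[i] : v[i + stride];

	return v[0];	/* low nibble: lane index, high 32 bits: block count */
}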
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_sse + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sha512_mb_mgr_flush_sse, function +sha512_mb_mgr_flush_sse: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_sse + ; state and idx are intact + + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movq xmm0, [state + _args_digest + 8*idx + 0*32] + 
pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + movq xmm1, [state + _args_digest + 8*idx + 2*32] + pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + movq xmm2, [state + _args_digest + 8*idx + 4*32] + pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + movq xmm3, [state + _args_digest + 8*idx + 6*32] + pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + movdqa [job_rax + _result_digest + 2*16], xmm2 + movdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c new file mode 100644 index 000000000..7ca997653 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c @@ -0,0 +1,45 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->lens[0] = 0; + state->lens[1] = 1; + state->lens[2] = 2; + state->lens[3] = 3; + state->unused_lanes = 0xFF03020100; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA512_X4_LANES; j++) { + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c new file mode 100644 index 000000000..bca9549d9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c @@ -0,0 +1,42 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->unused_lanes = 0x0706050403020100; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA512_MAX_LANES; j++) { + state->lens[j] = j; // sha512_mb uses low 32bit of lens to hold idx exclusively + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c new file mode 100644 index 000000000..0e9ec257f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c @@ -0,0 +1,43 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->lens[0] = 0; + state->lens[1] = 1; + state->unused_lanes = 0xFF0100; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA512_MIN_LANES; j++) { + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..1e3b1b1bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm @@ -0,0 +1,262 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
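[Editor's note, illustrative sketch] The three init variants above (SSE with 2 lanes, AVX2 with 4, AVX-512 with 8) seed the same bookkeeping: lens[] starts out holding just each lane's own index, and unused_lanes is a packed byte stack of free lane numbers (0xFF0100 for two lanes, 0xFF03020100 for four, 0x0706050403020100 for eight, with 0xFF filling the unused slots in the narrower variants; that same 0xFF residue is what the submit paths compare against to detect that every lane is busy). The submit and flush routines pop and push lane indices on that stack one byte at a time; a minimal C rendering of the two operations, with helper names of our own:

#include <stdint.h>

/* submit side: movzx lane, BYTE(unused_lanes) ; shr unused_lanes, 8 */
static unsigned pop_free_lane(uint64_t *unused_lanes)
{
        unsigned lane = (unsigned)(*unused_lanes & 0xFF);
        *unused_lanes >>= 8;
        return lane;
}

/* completion side: shl unused_lanes, 8 ; or unused_lanes, idx */
static void push_free_lane(uint64_t *unused_lanes, unsigned lane)
{
        *unused_lanes = (*unused_lanes << 8) | lane;
}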
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_avx + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha512_mb_mgr_submit_avx, function +sha512_mb_mgr_submit_avx: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + _result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*32], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*32], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*32], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*32], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, 
idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_avx + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..c425c5bb9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm @@ -0,0 +1,270 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x4_avx2 + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha512_mb_mgr_submit_avx2, function +sha512_mb_mgr_submit_avx2: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + 
_result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*32], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*32], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*32], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*32], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + mov lens2, [state + _lens + 2*8] + cmp lens2, idx + cmovb idx, lens2 + mov lens3, [state + _lens + 3*8] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + mov [state + _lens + 2*8], lens2 + mov [state + _lens + 3*8], lens3 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x4_avx2 + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..23b1b5c27 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm @@ -0,0 +1,280 @@ 
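[Editor's note, illustrative sketch] The "Find min length" sequence above relies on the lens[] packing set up at submit time: each 64-bit entry is (blocks_to_process << 32) | lane_index, so a plain unsigned compare orders lanes by remaining work and the low nibble of the winner names the lane that will finish first. A scalar C rendering of that scan for the 4-lane case (helper name is ours; the real code hands the returned block count to sha512_mb_x4_avx2 in arg2):

#include <stdint.h>

/* Each lens[] entry: (blocks << 32) | lane. Returns the number of blocks
 * every busy lane can advance by, and the lane that will finish first. */
static uint32_t find_min_lane(uint64_t lens[4], unsigned *lane)
{
        uint64_t min = lens[0];
        for (unsigned i = 1; i < 4; i++)
                if (lens[i] < min)
                        min = lens[i];

        *lane = (unsigned)(min & 0xF);            /* and idx, 0xF                  */
        uint64_t common = min & ~(uint64_t)0xFF;  /* and len2, ~0xFF               */
        if (common == 0)
                return 0;                         /* jz len_is_0: job already done */

        for (unsigned i = 0; i < 4; i++)          /* sub lensN, len2               */
                lens[i] -= common;
        return (uint32_t)(common >> 32);          /* shr len2, 32: block count     */
}

The AVX-512 variant that follows does the same reduction with vpminuq/vpalignr/vperm2i128 over eight lanes, and uses the clear_low_nibble mask so that subtracting the common block count never disturbs the lane index kept in the low dword.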
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern sha512_mb_x8_avx512 + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define num_lanes_inuse r9 +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*8 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha512_mb_mgr_submit_avx512, function +sha512_mb_mgr_submit_avx512: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 + mov [rsp + _XMM_SAVE + 8*5], r13 + mov [rsp + _XMM_SAVE + 8*6], r14 + mov [rsp + _XMM_SAVE + 8*7], r15 +%ifidn 
__OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqu [rsp + 16*0], xmm6 + vmovdqu [rsp + 16*1], xmm7 + vmovdqu [rsp + 16*2], xmm8 + vmovdqu [rsp + 16*3], xmm9 + vmovdqu [rsp + 16*4], xmm10 + vmovdqu [rsp + 16*5], xmm11 + vmovdqu [rsp + 16*6], xmm12 + vmovdqu [rsp + 16*7], xmm13 + vmovdqu [rsp + 16*8], xmm14 + vmovdqu [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + _result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*64], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*64], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*64], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*64], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 8 + jne return_null + +start_loop: + ; Find min length, len in sha512_mgr is 64bit, high 32bit is block num, low 8bit is idx + vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a} + vmovdqu ymm1, [state + _lens + 1*32] + + vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword + + vmovq idx, xmm2 + mov len2, idx + and idx, 0xF + shr len2, 32 + jz len_is_0 + + + vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i} + vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0} + vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0} + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x8_avx512 + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovq xmm0, [state + _args_digest + 8*idx + 0*64] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*64] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1 + vmovq xmm2, [state + 
_args_digest + 8*idx + 4*64] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*64] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1 + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 16*0] + vmovdqu xmm7, [rsp + 16*1] + vmovdqu xmm8, [rsp + 16*2] + vmovdqu xmm9, [rsp + 16*3] + vmovdqu xmm10, [rsp + 16*4] + vmovdqu xmm11, [rsp + 16*5] + vmovdqu xmm12, [rsp + 16*6] + vmovdqu xmm13, [rsp + 16*7] + vmovdqu xmm14, [rsp + 16*8] + vmovdqu xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov r13, [rsp + _XMM_SAVE + 8*5] + mov r14, [rsp + _XMM_SAVE + 8*6] + mov r15, [rsp + _XMM_SAVE + 8*7] + + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index + dq 0xFFFFFFFF00000000, 0x0000000000000000 + dq 0xFFFFFFFF00000000, 0x0000000000000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_mgr_submit_avx512 +no_sha512_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..ba12d586b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm @@ -0,0 +1,260 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
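[Editor's note, illustrative sketch] Once the x8 core returns, the len_is_0 block above retires the lane whose job just finished: the lane index goes back onto the unused_lanes stack, num_lanes_inuse drops by one, the job is stamped STS_COMPLETED and its pointer becomes the routine's return value. Condensed into C on a simplified manager struct (the struct layout and the status value here are stand-ins for illustration, not the real sha512_mb_mgr_datastruct.asm layout):

#include <stdint.h>
#include <stddef.h>

enum { TOY_STS_COMPLETED = 1 };            /* stand-in for STS_COMPLETED */

struct toy_job { int status; };
struct toy_mgr {
        uint64_t       unused_lanes;
        uint32_t       num_lanes_inuse;
        struct toy_job *job_in_lane[8];
};

static struct toy_job *retire_lane(struct toy_mgr *m, unsigned idx)
{
        struct toy_job *job = m->job_in_lane[idx];

        m->job_in_lane[idx] = NULL;                       /* mov qword [..job_in_lane], 0 */
        job->status = TOY_STS_COMPLETED;                  /* mov dword [..status], STS_.. */
        m->unused_lanes = (m->unused_lanes << 8) | idx;   /* shl unused_lanes / or idx    */
        m->num_lanes_inuse--;                             /* sub num_lanes_inuse, 1       */
        return job;                                       /* job_rax, the return value    */
}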
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_sse + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha512_mb_mgr_submit_sse, function +sha512_mb_mgr_submit_sse: + endbranch + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + movdqa [rsp + 16*0], xmm6 + movdqa [rsp + 16*1], xmm7 + movdqa [rsp + 16*2], xmm8 + movdqa [rsp + 16*3], xmm9 + movdqa [rsp + 16*4], xmm10 + movdqa [rsp + 16*5], xmm11 + movdqa [rsp + 16*6], xmm12 + movdqa [rsp + 16*7], xmm13 + movdqa [rsp + 16*8], xmm14 + movdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + movdqa xmm1, [job + _result_digest + 1*16] + movdqa xmm2, [job + _result_digest + 2*16] + movdqa xmm3, [job + _result_digest + 3*16] + movq [state + _args_digest + 8*lane + 0*32], xmm0 + pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + movq [state + _args_digest + 8*lane + 2*32], xmm1 + pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + movq [state + _args_digest + 8*lane + 4*32], xmm2 + pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + movq [state + _args_digest + 8*lane + 6*32], xmm3 + pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and 
len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_sse + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + movq xmm0, [state + _args_digest + 8*idx + 0*32] + pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + movq xmm1, [state + _args_digest + 8*idx + 2*32] + pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + movq xmm2, [state + _args_digest + 8*idx + 4*32] + pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + movq xmm3, [state + _args_digest + 8*idx + 6*32] + pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + movdqa [job_rax + _result_digest + 2*16], xmm2 + movdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 16*0] + movdqa xmm7, [rsp + 16*1] + movdqa xmm8, [rsp + 16*2] + movdqa xmm9, [rsp + 16*3] + movdqa xmm10, [rsp + 16*4] + movdqa xmm11, [rsp + 16*5] + movdqa xmm12, [rsp + 16*6] + movdqa xmm13, [rsp + 16*7] + movdqa xmm14, [rsp + 16*8] + movdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c new file mode 100644 index 000000000..74fa0384a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c @@ -0,0 +1,160 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + SHA512(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + to_be64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + SHA512(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + to_be64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test 
failed function check %d\n", fail); + else + printf(" multibinary_sha512_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c new file mode 100644 index 000000000..f71d06df8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c @@ -0,0 +1,203 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
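[Editor's note, illustrative sketch] One detail worth calling out in the checks above: OpenSSL's SHA512() writes the digest as a big-endian byte string, while the multi-buffer job keeps result_digest[] as native 64-bit words, which is why every comparison wraps the OpenSSL side in to_be64(). A standalone illustration on the first word of SHA-512("") (the byte-swap helper is a local stand-in for the one in endian_helper.h, and the hard-coded value assumes a little-endian host load):

#include <stdint.h>
#include <stdio.h>

static uint64_t byteswap64(uint64_t x)
{
        uint64_t r = 0;
        for (int i = 0; i < 8; i++)
                r = (r << 8) | ((x >> (8 * i)) & 0xFF);
        return r;
}

int main(void)
{
        /* SHA-512("") begins with bytes cf 83 e1 35 7e ef b8 bd; loading them
         * as a little-endian uint64_t gives the value below. */
        uint64_t loaded = 0xbdb8ef7e35e183cfULL;
        printf("%016llx\n", (unsigned long long)byteswap64(loaded)); /* cf83e1357eefb8bd */
        return 0;
}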
+**********************************************************************/ + +#include +#include +#include "sha512_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + int ret; + + printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha512_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], TEST_LEN); + + // Run sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%016lX <=> 0x%016lX \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sha512_mb test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%016lX <=> 0x%016lX\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sha512_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sha512_ref(bufs[i], digest_ref[i], lens[i]); + + 
// sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%016lX" + ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha512 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c new file mode 100644 index 000000000..383c45cd2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c @@ -0,0 +1,300 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
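[Editor's note, illustrative sketch] Both random tests above drive the API the same way: initialize the manager once, hash_ctx_init() each context, submit every buffer with HASH_ENTIRE, then spin on sha512_ctx_mgr_flush() until it returns NULL so that whatever is still sitting in partially filled lanes gets hashed. Stripped of the test scaffolding, that usage pattern looks roughly like the helper below (error handling trimmed; the helper name is ours, the calls are the ones used above):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "sha512_mb.h"

/* Hash a single buffer with the multi-buffer manager; returns 0 on success. */
static int sha512_one_shot(unsigned char *buf, uint32_t len,
                           uint64_t digest[SHA512_DIGEST_NWORDS])
{
        SHA512_HASH_CTX_MGR *mgr = NULL;
        SHA512_HASH_CTX ctx;

        if (posix_memalign((void *)&mgr, 16, sizeof(*mgr)) || mgr == NULL)
                return -1;

        sha512_ctx_mgr_init(mgr);
        hash_ctx_init(&ctx);

        sha512_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
        while (sha512_ctx_mgr_flush(mgr) != NULL)
                ;                               /* drain until every lane is idle */

        memcpy(digest, ctx.job.result_digest,
               SHA512_DIGEST_NWORDS * sizeof(uint64_t));
        free(mgr);
        return 0;
}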
+**********************************************************************/ + +#include +#include +#include "sha512_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SHA512_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS]; + +extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_sha512 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sha512_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha512_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha512_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d 
fail %8lX <=> %8lX", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sha512_ref(bufs[i], digest_ref[i], lens[i]); + } + + sha512_ctx_mgr_init(mgr); + + // Run sha512_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SHA512_BLOCK_SIZE + + SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SHA512_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, + HASH_LAST); + else // submit the random update length as UPDATE + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sha512_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha512_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SHA512_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha512_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8lX <=> %8lX\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha512_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c new file mode 100644 index 000000000..a84e7af3e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c @@ -0,0 +1,270 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
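[Editor's note, illustrative sketch] The update test above exercises the streaming side of the API: a job is opened with HASH_FIRST, extended with HASH_UPDATE chunks (the test keeps them multiples of SHA512_BLOCK_SIZE), closed with HASH_LAST, and whenever submit or flush hands back an unfinished context the caller resubmits it with its next piece. Reduced to a single buffer, the protocol looks roughly like the sketch below (our helper, real calls; assumes a nonzero chunk that is a multiple of SHA512_BLOCK_SIZE, len > 0, and an already initialized manager):

#include <stdint.h>
#include <string.h>
#include "sha512_mb.h"

static int sha512_streamed(SHA512_HASH_CTX_MGR *mgr, SHA512_HASH_CTX *ctx,
                           unsigned char *buf, uint32_t len, uint32_t chunk,
                           uint64_t digest[SHA512_DIGEST_NWORDS])
{
        SHA512_HASH_CTX *r;
        uint32_t n = (chunk < len) ? chunk : len;
        uint32_t done = n;

        hash_ctx_init(ctx);
        r = sha512_ctx_mgr_submit(mgr, ctx, buf, n,
                                  (n == len) ? HASH_ENTIRE : HASH_FIRST);

        for (;;) {
                if (r == NULL)
                        r = sha512_ctx_mgr_flush(mgr); /* nothing returned yet: force progress */
                if (r == NULL || hash_ctx_complete(r))
                        break;                         /* idle manager or finished job */

                /* Unfinished context came back: hand it its next piece. */
                n = (len - done > chunk) ? chunk : (len - done);
                r = sha512_ctx_mgr_submit(mgr, r, buf + done, n,
                                          (done + n == len) ? HASH_LAST : HASH_UPDATE);
                done += n;
        }

        if (!hash_ctx_complete(ctx))
                return -1;
        memcpy(digest, ctx->job.result_digest,
               SHA512_DIGEST_NWORDS * sizeof(uint64_t));
        return 0;
}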
+**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS]; + +#define MSGS 8 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog"; +static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog."; +static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 }; +static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 }; + +static uint8_t msg5[] = { + 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93, + 0x7f, 0x05, 0x41, 0x16, 0 +}; + +static uint8_t msg6[] = { + 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed, + 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0 +}; + +static uint8_t msg7[] = { + 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff, + 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90, + 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85, + 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39, + 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb, + 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64, + 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44, + 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27, + 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9, + 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e, + 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93, + 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0 +}; + +static uint8_t msg8[] = ""; + +static DigestSHA512 expResultDigest1 = { + 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64, + 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6 +}; + +static DigestSHA512 expResultDigest2 = { + 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb, + 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed +}; + +static DigestSHA512 expResultDigest3 = { + 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849, + 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b +}; + +static DigestSHA512 expResultDigest4 = { + 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635, + 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5 +}; + +static DigestSHA512 expResultDigest5 = { + 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5, + 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25 +}; + +static DigestSHA512 expResultDigest6 = { + 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c, + 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231 +}; + +static DigestSHA512 expResultDigest7 = { + 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c, + 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072 +}; + +static DigestSHA512 expResultDigest8 = { + 0Xcf83e1357eefb8bd, 0Xf1542850d66d8007, 0Xd620e4050b5715dc, 0X83f4a921d36ce9ce, + 0X47d0d13c5d85f2b0, 0Xff8318d2877eec2f, 0X63b931bd47417a81, 0Xa538327af927da3e +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 }; + +static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2, + expResultDigest3, expResultDigest4, 
expResultDigest5, expResultDigest6, + expResultDigest7, expResultDigest8 +}; + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint64_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha512_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + } + } + + while (1) { + ctx = sha512_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the " + "submit. Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sha512_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. 
Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha512 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c new file mode 100644 index 000000000..87d7837f6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c @@ -0,0 +1,129 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 1000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
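+// With the values below this works out to GT_L3_CACHE = 32 MiB spread
+// across TEST_BUFS = 32 lanes, i.e. TEST_LEN = 1 MiB per buffer, so each
+// benchmark pass streams data that no longer fits in the last level cache.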
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 10 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS]; + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha512_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA512(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha512_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sha512_ctx_mgr_submit(mgr, + &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sha512_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha512" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + to_be64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sha512 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf("multibinary_sha512_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm new file mode 100644 index 000000000..5d443faf7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm @@ -0,0 +1,442 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute SHA512 by-2 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux definitions +%define arg1 rdi +%define arg2 rsi +%else +; Windows definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPD vmovupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1 + vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0 +%endm + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + 
vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] + vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks) +;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used) +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +mk_global sha512_mb_x2_avx, function, internal +align 32 +sha512_mb_x2_avx: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
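+ ;; The state keeps the digest lane-interleaved: row r (one
+ ;; SHA512_DIGEST_ROW_SIZE stride) holds digest word r for every lane,
+ ;; one qword per lane, so each 16-byte load below picks up word r for
+ ;; both lanes at once:
+ ;;   row 0 -> a = {lane1_H0 lane0_H0}
+ ;;   ...
+ ;;   row 7 -> h = {lane1_H7 lane0_H7}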
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[K512_2_MB] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ2], a + vmovdqa [rsp + _DIGEST + 1*SZ2], b + vmovdqa [rsp + _DIGEST + 2*SZ2], c + vmovdqa [rsp + _DIGEST + 3*SZ2], d + vmovdqa [rsp + _DIGEST + 4*SZ2], e + vmovdqa [rsp + _DIGEST + 5*SZ2], f + vmovdqa [rsp + _DIGEST + 6*SZ2], g + vmovdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + VMOVPD TT2,[inp1+IDX+i*16] + + TRANSPOSE TT0, TT2, TT1 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 8 * 16 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ2] + vpaddq b, b, [rsp + _DIGEST + 1*SZ2] + vpaddq c, c, [rsp + _DIGEST + 2*SZ2] + vpaddq d, d, [rsp + _DIGEST + 3*SZ2] + vpaddq e, e, [rsp + _DIGEST + 4*SZ2] + vpaddq f, f, [rsp + _DIGEST + 5*SZ2] + vpaddq g, g, [rsp + _DIGEST + 6*SZ2] + vpaddq h, h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; consumed one message block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + + ; outer calling routine restores XMM and other GP registers + ret + +section .data +K512_2_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 
0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + + +align 32 +; one from sha512_rorx +; this does the big endian to little endian conversion +; over a quad word +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm new file mode 100644 index 000000000..6c658023f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm @@ -0,0 +1,424 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute SHA512 by-2 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else +; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPD movupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + movapd %%t0, %%r0 ; t0 = a1 a0 + 
shufpd %%r0, %%r1, 00b ; r0 = b0 a0 + shufpd %%t0, %%r1, 11b ; t0 = b1 a1 +%endm + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psllq %%tmp, (64-(%%imm)) + psrlq %%reg, %%imm + por %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORQ a0, (18-14) ; sig1: a0 = (e >> 4) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORQ a1, 41 ; sig1: a1 = (e >> 41) + movdqa [SZ2*(%%i&0xf) + rsp],%%T1 + paddq %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + paddq h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORQ a2, (34-28) ; sig0: a2 = (a >> 6) + paddq h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORQ a1, 39 ; sig0: a1 = (a >> 39) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddq h, a0 + + paddq d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddq h, a1 ; h = h + ch + W + K + maj + paddq h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ2*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORQ %%T1, 8-1 + movdqa a2, a1 + PRORQ a1, 61-19 + pxor %%T1, a0 + PRORQ %%T1, 1 + pxor a1, a2 + PRORQ a1, 19 + psrlq a0, 7 + pxor %%T1, a0 + psrlq a2, 6 + pxor a1, a2 + paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp] + paddq a1, [SZ2*((%%i-7)&0xf) + rsp] + paddq %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +;; void sha512_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks); +;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used) +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +mk_global sha512_mb_x2_sse, function, internal +align 32 +sha512_mb_x2_sse: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
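+ ;; Same lane-interleaved layout as the AVX by-2 routine: each 32-byte
+ ;; digest row holds one digest word for up to 4 lanes, and this by-2
+ ;; routine only touches the low 16 bytes (lanes 0 and 1) of each row.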
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[K512_2_MB] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ2], a + movdqa [rsp + _DIGEST + 1*SZ2], b + movdqa [rsp + _DIGEST + 2*SZ2], c + movdqa [rsp + _DIGEST + 3*SZ2], d + movdqa [rsp + _DIGEST + 4*SZ2], e + movdqa [rsp + _DIGEST + 5*SZ2], f + movdqa [rsp + _DIGEST + 6*SZ2], g + movdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + MOVPD TT2,[inp1+IDX+i*16] + TRANSPOSE TT0, TT2, TT1 + pshufb TT0, TMP + pshufb TT1, TMP + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + add IDX, 8 * 16 ;; increment by a message block + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddq a, [rsp + _DIGEST + 0*SZ2] + paddq b, [rsp + _DIGEST + 1*SZ2] + paddq c, [rsp + _DIGEST + 2*SZ2] + paddq d, [rsp + _DIGEST + 3*SZ2] + paddq e, [rsp + _DIGEST + 4*SZ2] + paddq f, [rsp + _DIGEST + 5*SZ2] + paddq g, [rsp + _DIGEST + 6*SZ2] + paddq h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a + movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b + movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c + movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d + movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e + movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f + movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g + movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + ret + +section .data +align 64 +mk_global K512_2_MB, data, internal +K512_2_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 
0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm new file mode 100644 index 000000000..0058f33a6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm @@ -0,0 +1,487 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA512 using AVX2 +;; use YMMs to tackle the larger digest size +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r13 r14 r15 +;; +;; Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers ymm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi +%define arg2 rsi +%else +; Windows register definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r8 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define a0 ymm8 +%define a1 ymm9 +%define a2 ymm10 + +%define TT0 ymm14 +%define TT1 ymm13 +%define TT2 ymm12 +%define TT3 ymm11 +%define TT4 ymm10 +%define TT5 ymm9 + +%define T1 ymm14 +%define TMP ymm15 + +%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ4 + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +struc stack_frame + .data resb 16*SZ4 + .digest resb NUM_SHA512_DIGEST_WORDS*SZ4 + .align resb 24 +endstruc + +%define _DIGEST stack_frame.digest + +%define VMOVPD vmovupd + +; operates on YMMs +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d1 d0 c1 c0 
b1 b0 a1 a0} +; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} +; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} +; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + ; vshufps does not cross the mid-way boundary and hence is cheaper + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + + vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} + + vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} + + vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} + + ; now ok to clobber t0 + vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0} + +%endmacro + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS + +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] + vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + +;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 
*STATE, const int INP_SIZE) +;; arg 1 : STATE : pointer to input data +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +mk_global sha512_mb_x4_avx2, function, internal +align 32 +sha512_mb_x4_avx2: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, stack_frame_size + + ;; Load the pre-transposed incoming digest. + vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE] + vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE] + vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE] + vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE] + vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE] + vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE] + vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE] + vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE] + + + lea TBL,[K512_4_MB] + + ;; load the address of each of the MAX_LANES (4) message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT2,[inp0+IDX+i*32] + VMOVPD TT1,[inp1+IDX+i*32] + VMOVPD TT4,[inp2+IDX+i*32] + VMOVPD TT3,[inp3+IDX+i*32] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 4 * 32 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ4] + vpaddq b, b, [rsp + _DIGEST + 1*SZ4] + vpaddq c, c, [rsp + _DIGEST + 2*SZ4] + vpaddq d, d, [rsp + _DIGEST + 3*SZ4] + vpaddq e, e, [rsp + _DIGEST + 4*SZ4] + vpaddq f, f, [rsp + _DIGEST + 5*SZ4] + vpaddq g, g, [rsp + _DIGEST + 6*SZ4] + vpaddq h, h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; consumed one message block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a + vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b + vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c + vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d + vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e + vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f + vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g + vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h + + ;; update input data pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, stack_frame_size + + ; outer calling routine restores XMM and other GP registers + ret + +section .data +align 64 
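+; SHA-512 round constants, each replicated across the four 64-bit lanes of
+; a YMM register so the single vpaddq against [TBL + ROUND] in ROUND_00_15
+; adds K[t] to the scheduled word of all four lanes at once.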
+K512_4_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 
0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 + +align 32 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm new file mode 100644 index 000000000..a93fecb1b --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm @@ -0,0 +1,644 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;; code to compute quad SHA512 using AVX512 +;; use ZMMs to tackle the larger digest size +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rsi +;; +;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdx rdi +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi ; usable + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var2 rdx ; arg2 + %define var1 rcx ; arg3 usable + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 +%define TBL r8 + +%define VMOVDQ32 vmovdqu32 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*8 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%define NUM_LANES 8 +%define SZ 8 +%define SZ8 8 * SZ +%define DIGEST_SZ 8 * SZ8 +%define DIGEST_SAVE NUM_LANES * DIGEST_SZ +%define RSP_SAVE 1*8 + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD 
_DIGEST_SAVE, NUM_LANES*8*64, 64 +FIELD _RSP, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +; from sha256_fips180-2.pdf +; define rotates for Sigma function for main loop steps +%define BIG_SIGMA_0_0 28 ; Sigma0 +%define BIG_SIGMA_0_1 34 +%define BIG_SIGMA_0_2 39 +%define BIG_SIGMA_1_0 14 ; Sigma1 +%define BIG_SIGMA_1_1 18 +%define BIG_SIGMA_1_2 41 + +; define rotates for Sigma function for scheduling steps + +%define SMALL_SIGMA_0_0 1 ; sigma0 +%define SMALL_SIGMA_0_1 8 +%define SMALL_SIGMA_0_2 7 +%define SMALL_SIGMA_1_0 19 ; sigma1 +%define SMALL_SIGMA_1_1 61 +%define SMALL_SIGMA_1_2 6 + +%define SHA_MAX_ROUNDS 80 +%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16) + +%macro TRANSPOSE8 12 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +%define %%PERM_INDEX1 %11 +%define %%PERM_INDEX2 %12 + + +; each x(i) is 32 bits, 16 * 32 = 512 ==> a full digest length, 32 single precision quantities +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} + + ;; ;;; will not get clobbered + vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp + vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp + + ; process top half (r0..r3) {a...d} + vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0} + vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1} + vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0} + vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1} + + vmovdqa32 %%r1, %%t0 ; r1 and r3 free + vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0} + vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2} + + vmovdqa32 %%t1, %%r0 ; t1 and r3 free + vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1} + vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + + ;; Likewise for top half ; r2 and r3 free + vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0} + vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1} + vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0} + vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1} + + vmovdqa32 %%r5, %%r2 ; r5 and r7 free + vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0} + vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2} + + vmovdqa32 %%r7, %%r4 + vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + +;;; free r3, r6 + vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 
= {h6 g6 f6 e6 d6 c6 b6 a6} + vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + +;;; t0 and r3 free + vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + + + vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + + ;; will re-order input to avoid move + ;vmovdqa32 %%r7, %%t0 + + ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} + ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} + ; temp + ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + + + +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39 +;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41 +;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7 +;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6 + +;; Main processing loop per round +;; equivalent to %macro ROUND_00_15 2 +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddq T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + ;; compute BIG_SIGMA_1(E) + vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E) + vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E) + vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E) + vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E) + vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddq T1, T1, %%WT ; T1 = T1 + Wt + vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E) + vpaddq D, D, T1 ; D = D + T1 + vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A) + vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A) + vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A) + vmovdqa32 TMP0, A + vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A) + vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C) + vpaddq H, H, T1 ; H(A) = H(T2) + T1 + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2) + vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2) + vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2) + vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2) + + vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) + vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7 + + vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15) + vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15) + vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15) + vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15) + + vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) + + ; Wt-7 + sigma_0(Wt-15) + + +%endmacro + +align 64 + +; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8, uint32_t size) +; arg 1 : pointer to 
input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha512_mb_x8_avx512) +sha512_mb_x8_avx512: + endbranch + mov rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _RSP], rax + lea TBL,[TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*8*8] + vmovups B, [DIGEST + 1*8*8] + vmovups C, [DIGEST + 2*8*8] + vmovups D, [DIGEST + 3*8*8] + vmovups E, [DIGEST + 4*8*8] + vmovups F, [DIGEST + 5*8*8] + vmovups G, [DIGEST + 6*8*8] + vmovups H, [DIGEST + 7*8*8] + + xor IDX, IDX + ;; Read in input data address, saving them in registers because + ;; they will serve as variables, which we shall keep incrementing + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + +lloop: + + ;; first half of 1024 (need to transpose before use) + vmovups W0,[inp0 + IDX ] + vmovups W1,[inp1 + IDX ] + vmovups W2,[inp2 + IDX ] + vmovups W3,[inp3 + IDX ] + vmovups W4,[inp4 + IDX ] + vmovups W5,[inp5 + IDX ] + vmovups W6,[inp6 + IDX ] + vmovups TMP0,[inp7 + IDX ] + TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3 + ;; second half of 1024 (need to transpose before use) + vmovups W8,[inp0 + SZ8 + IDX ] + vmovups W9,[inp1 + SZ8 + IDX ] + vmovups W10,[inp2 + SZ8 + IDX ] + vmovups W11,[inp3 + SZ8 + IDX ] + vmovups W12,[inp4 + SZ8 + IDX ] + vmovups W13,[inp5 + SZ8 + IDX ] + vmovups W14,[inp6 + SZ8 + IDX ] + vmovups TMP0,[inp7 + SZ8 + IDX ] + TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3 + + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + vmovdqa32 TMP3, [TBL] ; First K + + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 128 ; increment by message block length in bytes + + + + +%assign I 0 +%rep 16 +;;; little endian to big endian + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process first (max-rounds -16) + ; Calculate next Wt+16 after processing is complete and Wt is unneeded + + ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep SHA_ROUNDS_LESS_16 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) +%assign I (I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + ; Check is this is the last block + sub SIZE, 1 + je lastLoop + + ; Process last 16 rounds + ; Read in next block msg data for use in first 16 words of msg sched +%assign I SHA_ROUNDS_LESS_16 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + ; Add old digest + vpaddq A, A, [rsp + 
_DIGEST_SAVE + 64*0] + vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] + + jmp lloop + + +lastLoop: + ; Process last 16 rounds +%assign I SHA_ROUNDS_LESS_16 +%assign J 0 + +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] + +;; update into data pointers +%assign I 0 +%rep 4 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + VMOVDQ32 [DIGEST + 0*8*8], A + VMOVDQ32 [DIGEST + 1*8*8], B + VMOVDQ32 [DIGEST + 2*8*8], C + VMOVDQ32 [DIGEST + 3*8*8], D + VMOVDQ32 [DIGEST + 4*8*8], E + VMOVDQ32 [DIGEST + 5*8*8], F + VMOVDQ32 [DIGEST + 6*8*8], G + VMOVDQ32 [DIGEST + 7*8*8], H + + mov rsp, [rsp + _RSP] + ret + + section .data +align 64 +; 80 constants for SHA512 +; replicating for each lane, thus 8*80 +; to aid in SIMD .. space tradeoff for time! +; local to asm file, used nowhere else +TABLE: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 
0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 
0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 
0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 
0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 + +align 64 +; this does the big endian to little endian conversion over a quad word .. ZMM +;; shuffle on ZMM is shuffle on 4 XMM size chunks, 128 bits +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + dq 0x2021222324252627, 0x28292a2b2c2d2e2f + dq 0x3031323334353637, 0x38393a3b3c3d3e3f + +align 64 +TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_x8_avx512 +no_sha512_mb_x8_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm new file mode 100644 index 000000000..1113a1eea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm @@ -0,0 +1,252 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" + +;;;;; +; mbin_dispatch_init_avoton parameters +; Use this function when SSE/00/01 is a minimum requirement +; if AVOTON is true, then use avoton_func instead of sse_func +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +; 5-> AVOTON opt func +;;;;; +%macro mbin_dispatch_init_avoton 5 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + lea mbin_rdi, [%5 WRT_OPT] + and eax, FLAG_CPUID1_EAX_STEP_MASK + cmp eax, FLAG_CPUID1_EAX_AVOTON + ; If Avoton, set Avoton symbol and exit + cmove mbin_rsi, mbin_rdi + je _%1_init_done + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init6_avoton parameters +; if AVOTON is true, then use avoton_func instead of sse_func +; 1-> function name +; 2-> base function +; 3-> SSE4_1 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> AVOTON opt func +;;;;; +%macro mbin_dispatch_init6_avoton 7 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_1 + je _%1_init_done ; Use base function if no SSE4_1 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + lea mbin_rdi, [%7 WRT_OPT] + and eax, FLAG_CPUID1_EAX_STEP_MASK + cmp eax, FLAG_CPUID1_EAX_AVOTON + ; If Avoton, set Avoton symbol and exit + cmove mbin_rsi, mbin_rdi + je _%1_init_done + + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +default rel 
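Editor's note: the mbin_dispatch_init_avoton / mbin_dispatch_init6_avoton macros above generate a resolve-on-first-call dispatcher: the first call through an interface lands in %1_dispatch_init, which probes CPUID (plus XGETBV for the AVX/AVX2/AVX512 paths and the Avoton stepping check) and stores the address of the best available variant in %1_dispatched, so every later call is a single indirect jump to the chosen routine. The C sketch below illustrates that same pattern under stated assumptions; it is not the library's code. It leans on GCC/Clang's __builtin_cpu_supports() instead of the raw CPUID/XGETBV sequence, omits the Avoton special case, and the sha512_mgr_init_* names are hypothetical stand-ins for the per-ISA kernels.

/*
 * Minimal sketch of the resolve-on-first-call dispatch pattern, assuming
 * GCC/Clang x86 builtins. Function names are illustrative stand-ins; the
 * XGETBV and Avoton checks done by the assembly dispatcher are omitted.
 */
#include <stdio.h>

typedef void (*sha512_mgr_init_fn)(void *mgr);

/* Stand-ins for the per-ISA kernels the real dispatcher chooses between. */
static void sha512_mgr_init_base(void *mgr)   { (void)mgr; puts("base");   }
static void sha512_mgr_init_avx2(void *mgr)   { (void)mgr; puts("avx2");   }
static void sha512_mgr_init_avx512(void *mgr) { (void)mgr; puts("avx512"); }

static void sha512_mgr_init_resolve(void *mgr);

/* Dispatch slot starts out pointing at the resolver (cf. %1_dispatched). */
static sha512_mgr_init_fn sha512_mgr_init_dispatched = sha512_mgr_init_resolve;

static void sha512_mgr_init_resolve(void *mgr)
{
	sha512_mgr_init_fn fn = sha512_mgr_init_base;   /* safe default */

	__builtin_cpu_init();
	if (__builtin_cpu_supports("avx2"))
		fn = sha512_mgr_init_avx2;
	if (__builtin_cpu_supports("avx512f"))
		fn = sha512_mgr_init_avx512;

	sha512_mgr_init_dispatched = fn;   /* patch the slot once ...    */
	fn(mgr);                           /* ... then service this call */
}

/* Public entry point: always an indirect call through the slot. */
void sha512_mgr_init(void *mgr)
{
	sha512_mgr_init_dispatched(mgr);
}

int main(void)
{
	sha512_mgr_init(NULL);   /* first call resolves and runs the pick   */
	sha512_mgr_init(NULL);   /* second call goes straight to the kernel */
	return 0;
}

As with the assembly above, the detection cost is paid only on the first call; afterwards each invocation is just an indirect jump through the dispatched pointer.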
+[bits 64] + +%define def_wrd dq +%define wrd_sz qword +%define arg1 rsi + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern sha512_ctx_mgr_init_sse +extern sha512_ctx_mgr_submit_sse +extern sha512_ctx_mgr_flush_sse + +extern sha512_ctx_mgr_init_avx +extern sha512_ctx_mgr_submit_avx +extern sha512_ctx_mgr_flush_avx + +extern sha512_ctx_mgr_init_avx2 +extern sha512_ctx_mgr_submit_avx2 +extern sha512_ctx_mgr_flush_avx2 + +extern sha512_ctx_mgr_init_base +extern sha512_ctx_mgr_submit_base +extern sha512_ctx_mgr_flush_base + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sha512_ctx_mgr_init_avx512 + extern sha512_ctx_mgr_submit_avx512 + extern sha512_ctx_mgr_flush_avx512 +%endif + +extern sha512_ctx_mgr_init_sb_sse4 +extern sha512_ctx_mgr_submit_sb_sse4 +extern sha512_ctx_mgr_flush_sb_sse4 + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +; Initialise symbols +mbin_interface sha512_ctx_mgr_init +mbin_interface sha512_ctx_mgr_submit +mbin_interface sha512_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6 through replacing base by sse version + mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_base, \ + sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \ + sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \ + sha512_ctx_mgr_init_sb_sse4 + + mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_base, \ + sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \ + sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \ + sha512_ctx_mgr_submit_sb_sse4 + + mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_base, \ + sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \ + sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \ + sha512_ctx_mgr_flush_sb_sse4 +%else + mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \ + sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \ + sha512_ctx_mgr_init_sb_sse4 + + mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \ + sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \ + sha512_ctx_mgr_submit_sb_sse4 + + mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \ + sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \ + sha512_ctx_mgr_flush_sb_sse4 +%endif + + +;;; func core, ver, snum +slversion sha512_ctx_mgr_init, 00, 04, 0175 +slversion sha512_ctx_mgr_submit, 00, 04, 0176 +slversion sha512_ctx_mgr_flush, 00, 04, 0177 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c new file mode 100644 index 000000000..e9b156a33 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c @@ -0,0 +1,234 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sha512_mb.h" +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA512 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#define H0 0x6a09e667f3bcc908 +#define H1 0xbb67ae8584caa73b +#define H2 0x3c6ef372fe94f82b +#define H3 0xa54ff53a5f1d36f1 +#define H4 0x510e527fade682d1 +#define H5 0x9b05688c2b3e6c1f +#define H6 0x1f83d9abfb41bd6b +#define H7 0x5be0cd19137e2179 + +void sha512_single(const uint8_t * data, uint64_t digest[]); + +void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA512_BLOCK_SIZE]; + + /* 128 bit lengths not needed as len is uint32_t, so use 64 bit length + * and pad the first 64 bits with zeros. */ + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + digest[5] = H5; + digest[6] = H6; + digest[7] = H7; + + i = len; + /* Hash the complete blocks */ + while (i >= SHA512_BLOCK_SIZE) { + sha512_single(input_data, digest); + input_data += SHA512_BLOCK_SIZE; + i -= SHA512_BLOCK_SIZE; + } + + /* Copy remainder to a buffer to be padded */ + memcpy(buf, input_data, i); + buf[i++] = 0x80; + + // Pad more than required here and overwrite with length + for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++) + buf[j] = 0; + + if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE) + i = 2 * SHA512_BLOCK_SIZE; + else + i = SHA512_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + /* Hash the padded last block */ + sha512_single(buf, digest); + if (i == 256) + sha512_single(buf + 128, digest); +} + +/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words + * instead of 32 bit. + */ +#define ch(e,f,g) ((e & f) ^ (g & ~e)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) + +/* Sigma functions have same form as SHA256 but + * - change the word size to 64bit + * - change the amount to rotate + */ +#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r)))) + +/* Technically, s0 should be S0 as these are "capital sigma" functions, and likewise the case + * of the S0 should be s0, but keep as-is to avoid confusion with the other reference functions. 
+ */ +#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39)) +#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41)) + +#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7)) +#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6)) + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be64(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +void sha512_single(const uint8_t * data, uint64_t digest[]) +{ + /* Check these are all uint64_t */ + uint64_t a, b, c, d, e, f, g, h, t1, t2; + uint64_t w[16]; + uint64_t *ww = (uint64_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22); + step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc); + step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242); + step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe); + step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4); + step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5); + step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210); + step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f); + step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70); + step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc); + step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed); + step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df); + step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6); + step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364); + step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30); + step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218); + step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910); + step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a); + step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8); + step(48, a, b, c, d, e, f, g, h, 
0x19a4c116b8d2d0c8); + step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53); + step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60); + step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72); + step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec); + step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28); + step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63 + step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c); + step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207); + step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e); + step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178); + step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba); + step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6); + step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae); + step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b); + step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84); + step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493); + step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc); + step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c); + step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6); + step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a); + step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec); + step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79 + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c new file mode 100644 index 000000000..6650b0106 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c @@ -0,0 +1,46 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +/* + * Function: sha512_sb_mgr_flush_sse4. + * + * Description: This is a dummy API. Nothing done here. + * + * Return: always NULL. + * + * */ +SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state) +{ + return NULL; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c new file mode 100644 index 000000000..69df5600d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c @@ -0,0 +1,38 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +// For single buffer APIs, nothing to be done here. +// This function is required, to comply with the usage of +// multi-buffer APIs. +void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state) +{ + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c new file mode 100644 index 000000000..96e1a5ee4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +/* + * Function: sha512_sb_mgr_submit_sse4 + * + * Description: Wrapper API for update routine of single buffer sha512, + * to comply with multi-buffer API. + * + * This function will pick up message/digest and length + * information from the argument "job", then call into + * sha512_sse4(). Argument "state" is passed in, but not + * really used here. + * + * Note: message init and padding is done outside. This function + * expects a packed buffer. + * + * Argument: state - not really used. + * job - contained message, digest, message length information, etc. + * + * Return: SHA512_JOB pointer. + * + **/ +SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job) +{ + assert(job != NULL); + + uint8_t *buff = job->buffer; + uint64_t *digest = job->result_digest, len = job->len; + + sha512_sse4((const void *)buff, (void *)digest, len); + + return job; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm new file mode 100644 index 000000000..8b43bce5e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm @@ -0,0 +1,396 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; Virtual Registers +%ifidn __OUTPUT_FORMAT__, win64 + %define msg rcx ; ARG1 + %define digest rdx ; ARG2 + %define msglen r8 ; ARG3 + %define T1 rsi + %define T2 rdi +%else + %define msg rdi ; ARG1 + %define digest rsi ; ARG2 + %define msglen rdx ; ARG3 + %define T1 rcx + %define T2 r8 +%endif +%define a_64 r9 +%define b_64 r10 +%define c_64 r11 +%define d_64 r12 +%define e_64 r13 +%define f_64 r14 +%define g_64 r15 +%define h_64 rbx +%define tmp0 rax + +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +struc frame + .W: resq 80 ; Message Schedule + .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1] + +%ifidn __OUTPUT_FORMAT__, win64 + .GPRSAVE: resq 7 +%else + .GPRSAVE: resq 5 +%endif +endstruc + +; Useful QWORD "arrays" for simpler memory references +%define MSG(i) msg + 8*(i) ; Input message (arg1) +%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2) +%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem) +%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame) +%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame) +; MSG, DIGEST, K_t, W_t are arrays +; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine %%TMP h_64 + %xdefine h_64 g_64 + %xdefine g_64 f_64 + %xdefine f_64 e_64 + %xdefine e_64 d_64 + %xdefine d_64 c_64 + %xdefine c_64 b_64 + %xdefine b_64 a_64 + %xdefine a_64 %%TMP +%endmacro + +%macro SHA512_Round 1 +%assign %%t (%1) + + ; Compute Round %%t + mov T1, f_64 ; T1 = f + mov tmp0, e_64 ; tmp = e + xor T1, g_64 ; T1 = f ^ g + ror tmp0, 23 ; 41 ; tmp = e ror 23 + and T1, e_64 ; T1 = (f ^ g) & e + xor tmp0, e_64 ; tmp = (e ror 23) ^ e + xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler + ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4 + xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e + mov T2, a_64 ; T2 = a + add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h + ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov tmp0, a_64 ; tmp = a + xor T2, c_64 ; T2 = a ^ c + and tmp0, c_64 ; tmp = a & c + and T2, b_64 ; T2 = (a ^ c) & b + xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov tmp0, a_64 ; tmp = a + ror tmp0, 5 ; 39 ; tmp = a ror 5 + xor tmp0, a_64 ; tmp = 
(a ror 5) ^ a + add d_64, T1 ; e(next_state) = d + T1 + ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6 + xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a + lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c) + ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +%endmacro + +%macro SHA512_2Sched_2Round_sse 1 +%assign %%t (%1) + + ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. + + mov T1, f_64 + movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2] + xor T1, g_64 + and T1, e_64 + movdqa xmm0, xmm2 ; XMM0 = W[t-2] + xor T1, g_64 + add T1, [WK_2(%%t)] + movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15] + mov tmp0, e_64 + ror tmp0, 23 ; 41 + movdqa xmm3, xmm5 ; XMM3 = W[t-15] + xor tmp0, e_64 + ror tmp0, 4 ; 18 + psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42 + xor tmp0, e_64 + ror tmp0, 14 ; 14 + psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1 + add T1, tmp0 + add T1, h_64 + pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov T2, a_64 + xor T2, c_64 + pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15] + and T2, b_64 + mov tmp0, a_64 + psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and tmp0, c_64 + xor T2, tmp0 + psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov tmp0, a_64 + ror tmp0, 5 ; 39 + pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor tmp0, a_64 + ror tmp0, 6 ; 34 + pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor tmp0, a_64 + ror tmp0, 28 ; 28 + psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add T2, tmp0 + add d_64, T1 + psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea h_64, [T1 + T2] + RotateState + movdqa xmm1, xmm2 ; XMM1 = W[t-2] + mov T1, f_64 + xor T1, g_64 + movdqa xmm4, xmm5 ; XMM4 = W[t-15] + and T1, e_64 + xor T1, g_64 + psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42 + add T1, [WK_2(%%t+1)] + mov tmp0, e_64 + psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7 + ror tmp0, 23 ; 41 + xor tmp0, e_64 + pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2] + ror tmp0, 4 ; 18 + xor tmp0, e_64 + pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15] + ror tmp0, 14 ; 14 + add T1, tmp0 + psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add T1, h_64 + mov T2, a_64 + psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor T2, c_64 + and T2, b_64 + pxor xmm0, xmm1 ; XMM0 = s1(W[t-2]) + mov tmp0, a_64 + and tmp0, c_64 + movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7] + xor T2, tmp0 + pxor xmm3, xmm4 ; XMM3 = s0(W[t-15]) + mov tmp0, a_64 + paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + ror tmp0, 5 ; 39 + paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + 
W[t-16] + xor tmp0, a_64 + paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror tmp0, 6 ; 34 + movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords + xor tmp0, a_64 + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + ror tmp0, 28 ; 28 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds + add T2, tmp0 + add d_64, T1 + lea h_64, [T1 + T2] + RotateState +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_sse4(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks. +mk_global sha512_sse4, function +sha512_sse4: + endbranch + cmp msglen, 0 + je .nowork + + ; Allocate Stack Space + sub rsp, frame_size + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbx + mov [rsp + frame.GPRSAVE + 8 * 1], r12 + mov [rsp + frame.GPRSAVE + 8 * 2], r13 + mov [rsp + frame.GPRSAVE + 8 * 3], r14 + mov [rsp + frame.GPRSAVE + 8 * 4], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + frame.GPRSAVE + 8 * 5], rsi + mov [rsp + frame.GPRSAVE + 8 * 6], rdi +%endif + +.updateblock: + + ; Load state variables + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + %assign t 0 + %rep 80/2 + 1 + ; (80 rounds) / (2 rounds/iteration) + (1 iteration) + ; +1 iteration because the scheduler leads hashing by 1 iteration + %if t < 2 + ; BSWAP 2 QWORDS + movdqa xmm1, [XMM_QWORD_BSWAP] + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + movdqa [WK_2(t)], xmm0 ; Store into WK for rounds + %elif t < 16 + ; BSWAP 2 QWORDS; Compute 2 Rounds + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + SHA512_Round t - 2 ; Round t-2 + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + SHA512_Round t - 1 ; Round t-1 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK + %elif t < 79 + ; Schedule 2 QWORDS; Compute 2 Rounds + SHA512_2Sched_2Round_sse t + %else + ; Compute 2 Rounds + SHA512_Round t - 2 + SHA512_Round t - 1 + %endif + %assign t t+2 + %endrep + + ; Update digest + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + ; Advance to next message block + add msg, 16*8 + dec msglen + jnz .updateblock + + ; Restore GPRs + mov rbx, [rsp + frame.GPRSAVE + 8 * 0] + mov r12, [rsp + frame.GPRSAVE + 8 * 1] + mov r13, [rsp + frame.GPRSAVE + 8 * 2] + mov r14, [rsp + frame.GPRSAVE + 8 * 3] + mov r15, [rsp + frame.GPRSAVE + 8 * 4] +%ifidn __OUTPUT_FORMAT__, win64 + mov rsi, [rsp + frame.GPRSAVE + 8 * 5] + mov rdi, [rsp + frame.GPRSAVE + 8 * 6] +%endif + ; Restore Stack Pointer + add rsp, frame_size + +.nowork: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data + +section .data + +ALIGN 16 + +; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
+XMM_QWORD_BSWAP: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +; K[t] used in SHA512 hashing +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am new file mode 100644 index 000000000..8f8a3f4a6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am @@ -0,0 +1,121 @@ +######################################################################## +# Copyright(c) 2011-2020 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_x86_64 += sm3_mb/sm3_ctx_base.c \ + sm3_mb/sm3_multibinary.asm + +lsrc_base_aliases += sm3_mb/sm3_ctx_base.c \ + sm3_mb/sm3_ctx_base_aliases.c + +lsrc_aarch64 += sm3_mb/sm3_ctx_base.c \ + sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c \ + sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S \ + sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c \ + sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c \ + sm3_mb/aarch64/sm3_mb_sm_x1.S \ + sm3_mb/aarch64/sm3_mb_sm_x2.S \ + sm3_mb/aarch64/sm3_mb_sm_x3.S \ + sm3_mb/aarch64/sm3_mb_sm_x4.S \ + sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c \ + sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c \ + sm3_mb/aarch64/sm3_mb_asimd_x1.S \ + sm3_mb/aarch64/sm3_mb_asimd_x4.S + + +src_include += -I $(srcdir)/sm3_mb + +extern_hdrs += include/sm3_mb.h \ + include/multi_buffer.h + +lsrc_x86_64 += sm3_mb/sm3_ctx_avx512.c \ + sm3_mb/sm3_mb_mgr_submit_avx512.asm \ + sm3_mb/sm3_mb_mgr_flush_avx512.asm \ + sm3_mb/sm3_mb_x16_avx512.asm + +lsrc_x86_64 += sm3_mb/sm3_ctx_avx2.c \ + sm3_mb/sm3_mb_mgr_submit_avx2.asm \ + sm3_mb/sm3_mb_mgr_flush_avx2.asm \ + sm3_mb/sm3_mb_x8_avx2.asm + +other_src += include/datastruct.asm \ + include/multibinary.asm \ + include/reg_sizes.asm \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h \ + sm3_mb/sm3_job.asm \ + sm3_mb/sm3_mb_mgr_datastruct.asm \ + sm3_mb/sm3_test_helper.c + +check_tests += sm3_mb/sm3_ref_test + +unit_tests += sm3_mb/sm3_mb_rand_ssl_test \ + sm3_mb/sm3_mb_rand_test \ + sm3_mb/sm3_mb_rand_update_test \ + sm3_mb/sm3_mb_flush_test \ + sm3_mb/sm3_mb_test + +perf_tests += sm3_mb/sm3_mb_vs_ossl_perf \ + sm3_mb/sm3_mb_vs_ossl_shortage_perf + +sm3_mb_rand_ssl_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sm3_mb_rand_ssl_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_ssl_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_rand_update_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_update_test_LDFLAGS = -lcrypto + +sm3_mb_rand_update_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_update_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_flush_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_flush_test_LDFLAGS = -lcrypto + +sm3_mb_flush_test: sm3_test_helper.o +sm3_mb_sm3_mb_flush_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_rand_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_test_LDFLAGS = -lcrypto + +sm3_mb_rand_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_vs_ossl_perf: LDLIBS += -lcrypto +sm3_mb_sm3_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sm3_mb_vs_ossl_perf: sm3_test_helper.o +sm3_mb_sm3_mb_vs_ossl_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto + +sm3_mb_vs_ossl_shortage_perf: sm3_test_helper.o +sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la diff --git 
a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..208a7414e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2019-2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_submit_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_submit_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_init_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_init_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_flush_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_flush_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S new file mode 100644 index 000000000..c7362de90 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S @@ -0,0 +1,387 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTmsgARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED msgARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED msgARRANTIES OF MERCHANTABILITY AND FITNESS FOR + dig_A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OmsgNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOmsgEVER CAUSED AND ON ANY + THEORY OF LIABILITY, msgHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERmsgISE) ARISING IN ANY msgAY OUT OF THE USE + OF THIS SOFTmsgARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + + job .req x0 + len .req x1 + data .req x2 + digest .req x0 + + msg0 .req w3 + msg1 .req w4 + msg2 .req w5 + msg3 .req w6 + msg4 .req w7 + + msg .req w9 + msgP .req w10 + SS1 .req w11 + SS2 .req w12 + TT1 .req w13 + TT2 .req w14 + Tj .req w15 + tmp0 .req w19 + tmp1 .req w20 + dig_A .req w21 + dig_B .req w22 + dig_C .req w23 + dig_D .req w24 + dig_E .req w25 + dig_F .req w26 + dig_G .req w27 + dig_H .req w28 + + declare_var_vector_reg dig0,0 + declare_var_vector_reg dig1,1 + declare_var_vector_reg dig0_bak,2 + declare_var_vector_reg dig1_bak,3 + declare_var_vector_reg vect_msg0,4 + declare_var_vector_reg vect_msg1,5 + declare_var_vector_reg vect_msg2,6 + declare_var_vector_reg vect_msg3,7 + + declare_var_vector_reg vect_msgP0,16 + declare_var_vector_reg vect_msgP1,17 + declare_var_vector_reg vect_msgP2,18 + + + + + + +// round 0-11 +.macro sm3_round_0 round:req + ldr msg, [sp,msg_off+4*\round\()] + ldr msgP,[sp,wp_off +4*\round\()] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,msgP + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor TT1,TT1,dig_C + add SS2,SS2,dig_D + add SS1,SS1,dig_H + add TT1,TT1,SS2 + add TT2,TT2,SS1 + mov dig_D,dig_C + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,(32-1) +.endm + +//round 12-15 +.macro sm3_round_12 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + 
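+	// Message expansion, interleaved with the round computation: with j = round+4,
+	// W[j] = P1(W[j-16] ^ W[j-9] ^ ROTL32(W[j-3],15)) ^ ROTL32(W[j-13],7) ^ W[j-6].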
ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,dig_D + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + eor TT1,TT1,dig_C + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +// round 16-62 +.macro sm3_round_16 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +//round 63 +.macro sm3_round_63 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ins vdig0_bak.s[3],dig_C + ror dig_C,dig_B,32-9 + ins vdig0_bak.s[1],dig_A + ins vdig0_bak.s[0],TT1 + ins vdig0_bak.s[2],dig_C + eor TT1,TT2,TT2,ror (32-17) + ins vdig1_bak.s[3],dig_G + ror dig_G,dig_F,32-19 + ins vdig1_bak.s[1],dig_E + ins vdig1_bak.s[2],dig_G + eor dig_E,TT1,TT2,ror(32-9) + ins vdig1_bak.s[0],dig_E +.endm + + .set wp_off , 96 + .set msg_off, 96 + 12*4 +#define STACK_SIZE 224 + .global sm3_mb_asimd_x1 + .type sm3_mb_asimd_x1, %function +sm3_mb_asimd_x1: + stp x29,x30, [sp,-STACK_SIZE]! 
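+	// x0 carries the SM3_JOB and x1 the number of 64-byte blocks to hash; the
+	// buffer pointer is the job's first field, and after the post-indexed load
+	// x0 points at the job's result digest, which is loaded and byte-swapped.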
+ cmp len,0 + ldr data,[job],64 + ldp qdig0,qdig1,[digest] + stp x19, x20, [sp, 16] + stp x21, x22, [sp, 32] + rev32 vdig0.16b,vdig0.16b + stp x23, x24, [sp, 48] + rev32 vdig1.16b,vdig1.16b + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + ble .exit_func + +.start_loop: + + /** prepare first 12 round data **/ + ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64 + mov Tj, 17689 + umov dig_A,vdig0.s[0] + movk Tj, 0x79cc, lsl 16 + rev32 vvect_msg0.16b,vvect_msg0.16b + umov dig_B,vdig0.s[1] + rev32 vvect_msg1.16b,vvect_msg1.16b + umov dig_C,vdig0.s[2] + rev32 vvect_msg2.16b,vvect_msg2.16b + umov dig_D,vdig0.s[3] + rev32 vvect_msg3.16b,vvect_msg3.16b + umov dig_E,vdig1.s[0] + stp qvect_msg0,qvect_msg1,[sp,msg_off] + umov dig_F,vdig1.s[1] + stp qvect_msg2,qvect_msg3,[sp,msg_off+32] + umov dig_G,vdig1.s[2] + eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b + eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b + umov dig_H,vdig1.s[3] + stp qvect_msgP0,qvect_msgP1,[sp,wp_off] + eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b + str qvect_msgP2,[sp,wp_off+32] + + sm3_round_0 0 + sm3_round_0 1 + sm3_round_0 2 + sm3_round_0 3 + sm3_round_0 4 + sm3_round_0 5 + sm3_round_0 6 + sm3_round_0 7 + sm3_round_0 8 + sm3_round_0 9 + sm3_round_0 10 + sm3_round_0 11 + + sm3_round_12 12 + sm3_round_12 13 + sm3_round_12 14 + sm3_round_12 15 + mov Tj, 0x7a87 + movk Tj, 0x9d8a, lsl 16 + sm3_round_16 16 + sm3_round_16 17 + sm3_round_16 18 + sm3_round_16 19 + sm3_round_16 20 + sm3_round_16 21 + sm3_round_16 22 + sm3_round_16 23 + sm3_round_16 24 + sm3_round_16 25 + sm3_round_16 26 + sm3_round_16 27 + sm3_round_16 28 + sm3_round_16 29 + sm3_round_16 30 + sm3_round_16 31 + sm3_round_16 32 + sm3_round_16 33 + sm3_round_16 34 + sm3_round_16 35 + sm3_round_16 36 + sm3_round_16 37 + sm3_round_16 38 + sm3_round_16 39 + sm3_round_16 40 + sm3_round_16 41 + sm3_round_16 42 + sm3_round_16 43 + sm3_round_16 44 + sm3_round_16 45 + sm3_round_16 46 + sm3_round_16 47 + sm3_round_16 48 + sm3_round_16 49 + sm3_round_16 50 + sm3_round_16 51 + sm3_round_16 52 + sm3_round_16 53 + sm3_round_16 54 + sm3_round_16 55 + sm3_round_16 56 + sm3_round_16 57 + sm3_round_16 58 + sm3_round_16 59 + sm3_round_16 60 + sm3_round_16 61 + sm3_round_16 62 + sm3_round_63 63 + subs len,len,1 + eor vdig0.16b,vdig0.16b,vdig0_bak.16b + eor vdig1.16b,vdig1.16b,vdig1_bak.16b + bne .start_loop +.exit_func: + ldp x19, x20, [sp, 16] + rev32 vdig0.16b,vdig0.16b + ldp x21, x22, [sp, 32] + rev32 vdig1.16b,vdig1.16b + ldp x23, x24, [sp, 48] + stp qdig0,qdig1,[digest] + ldp x25, x26, [sp, 64] + ldp x27, x28, [sp, 80] + ldp x29, x30, [sp], STACK_SIZE + ret + .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S new file mode 100644 index 000000000..975a07c7a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S @@ -0,0 +1,576 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTmsgARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED msgARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED msgARRANTIES OF MERCHANTABILITY AND FITNESS FOR + dig_A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OmsgNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOmsgEVER CAUSED AND ON ANY + THEORY OF LIABILITY, msgHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERmsgISE) ARISING IN ANY msgAY OUT OF THE USE + OF THIS SOFTmsgARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + len .req x4 + + job0_data .req x5 + job1_data .req x6 + job2_data .req x7 + job3_data .req x9 + + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + job3_digest .req x3 + job0_tmp .req x10 + job1_tmp .req x11 + job2_tmp .req x12 + job3_tmp .req x13 + const_adr .req x14 + + + declare_var_vector_reg msg0,0 + declare_var_vector_reg msg1,1 + declare_var_vector_reg msg2,2 + declare_var_vector_reg msg3,3 + declare_var_vector_reg msg4,4 + declare_var_vector_reg msg5,5 + declare_var_vector_reg msg6,6 + declare_var_vector_reg msg7,7 + declare_var_vector_reg msg8,8 + declare_var_vector_reg msg9,9 + declare_var_vector_reg msg10,10 + declare_var_vector_reg msg11,11 + declare_var_vector_reg msg12,12 + declare_var_vector_reg msg13,13 + declare_var_vector_reg msg14,14 + declare_var_vector_reg msg15,15 + declare_var_vector_reg msg16,16 + + + declare_var_vector_reg dig_A,24 + declare_var_vector_reg dig_B,25 + declare_var_vector_reg dig_C,26 + declare_var_vector_reg dig_D,27 + declare_var_vector_reg dig_E,28 + declare_var_vector_reg dig_F,29 + declare_var_vector_reg dig_G,30 + declare_var_vector_reg dig_H,31 + + declare_var_vector_reg TT1,17 + declare_var_vector_reg TT2,18 + declare_var_vector_reg SS1,19 + declare_var_vector_reg SS2,20 + declare_var_vector_reg tmp0,21 + declare_var_vector_reg word_pair,23 + declare_var_vector_reg Tj,22 + + +.macro rol32 target:req,reg:req,bit:req + ushr v\target\().4s,v\reg\().4s,32 - \bit + sli v\target\().4s,v\reg\().4s,\bit +.endm + +// round 0-11 +.macro sm3_round_0 round:req,wp:req + + ushr vtmp0.4s,vdig_A.4s,32 - 12 + + add vSS1.4s,vdig_E.4s,vTj.4s + sli vtmp0.4s,vdig_A.4s,12 + rev32 vmsg\round\().16b,vmsg\round\().16b + rev32 vmsg\wp\().16b,vmsg\wp\().16b + add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,TT1,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b + + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + + 
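+	// Rounds 0-15: the SM3 boolean functions reduce to plain XORs, so TT1 gathers
+	// A^B^C and TT2 gathers E^F^G; each 32-bit lane holds one of the four jobs.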
add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + +.endm + + +.macro sm3_round_4 round:req,wp:req + + ushr vtmp0.4s,vdig_A.4s,32 - 12 + add vSS1.4s,vdig_E.4s,vTj.4s + sli vtmp0.4s,vdig_A.4s,12 + rev32 vmsg\wp\().16b,vmsg\wp\().16b + add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,TT1,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + +.endm + +//round 12-15 +.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli 
vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +// round 16-62 +.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + mov vTT2.16b,vdig_E.16b + orr vTT1.16b,vdig_B.16b,vdig_C.16b + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +//round 63 +.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + + ldp qmsg0,qmsg1,[sp,dig_off+ 0] + mov vTT2.16b,vdig_E.16b + ldp qmsg2,qmsg3,[sp,dig_off+ 32] + orr vTT1.16b,vdig_B.16b,vdig_C.16b + ldp qmsg4,qmsg5,[sp,dig_off+ 64] + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + ldp qmsg6,qmsg7,[sp,dig_off+ 96] + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add 
vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + //D=C + eor vdig_D.16b,vdig_C.16b,vmsg3.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + eor vdig_C.16b,vdig_C.16b,vmsg2.16b + //B=A + eor vdig_B.16b,vdig_A.16b,vmsg1.16b + stp qdig_C,qdig_D,[sp,dig_off+ 32] + //A=TT1 + eor vdig_A.16b,vTT1.16b,vmsg0.16b + // H=G + eor vdig_H.16b,vdig_G.16b,vmsg7.16b + stp qdig_A,qdig_B,[sp,dig_off+ 0] + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + eor vdig_G.16b,vdig_G.16b,vmsg6.16b + //F = E + eor vdig_F.16b,vdig_E.16b,vmsg5.16b + stp qdig_G,qdig_H,[sp,dig_off+ 96] + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + eor vdig_E.16b, vdig_E.16b, vmsg4.16b + stp qdig_E,qdig_F,[sp,dig_off+ 64] +.endm + + .set dig_off , 80 + +#define STACK_SIZE 224 + .global sm3_mb_asimd_x4 + .type sm3_mb_asimd_x4, %function +sm3_mb_asimd_x4: + stp x29,x30, [sp,-STACK_SIZE]! + cmp len,0 + //push d8~d15 + ldr job0_data, [job0],64 + stp d8,d9, [sp,16] + ldr job1_data, [job1],64 + stp d10,d11,[sp,32] + ldr job2_data, [job2],64 + stp d12,d13,[sp,48] + ldr job3_data, [job3],64 + stp d14,d15,[sp,64] + ble .exit_func + + mov job0_tmp,job0_digest + mov job1_tmp,job1_digest + mov job2_tmp,job2_digest + mov job3_tmp,job3_digest + //load digests + ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16 + ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16 + ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16 + adrp const_adr, .consts + ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16 + add const_adr, const_adr, #:lo12:.consts + ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp] + rev32 vdig_A.16b,vdig_A.16b + ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp] + rev32 vdig_B.16b,vdig_B.16b + ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp] + rev32 vdig_C.16b,vdig_C.16b + ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp] + rev32 vdig_D.16b,vdig_D.16b + stp qdig_A,qdig_B,[sp,dig_off+ 0] + rev32 vdig_E.16b,vdig_E.16b + rev32 vdig_F.16b,vdig_F.16b + stp qdig_C,qdig_D,[sp,dig_off+ 32] + rev32 vdig_G.16b,vdig_G.16b + rev32 vdig_H.16b,vdig_H.16b + stp qdig_E,qdig_F,[sp,dig_off+ 64] + stp qdig_G,qdig_H,[sp,dig_off+ 96] + +.start_loop: + ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16 + ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16 + ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16 + ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16 + ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16 + ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16 + ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16 + ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16 + ld4 {vmsg8.s-vmsg11.16b}[0],[job0_data],16 + ldr qTj,[const_adr] + + sm3_round_0 0, 4 + + ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16 + sm3_round_0 1, 5 + + ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16 + sm3_round_0 2, 6 + ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16 + sm3_round_0 3, 7 + + ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16 + + sm3_round_4 4, 8 + ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16 + sm3_round_4 5, 9 + ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16 + sm3_round_4 6,10 + ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16 + sm3_round_4 7,11 + sm3_round_4 8,12 + sm3_round_4 9,13 + sm3_round_4 10,14 + sm3_round_4 11,15 + + sm3_round_12 12,16, 0, 7,13, 3,10 //12 + sm3_round_12 13, 0, 1, 8,14, 4,11 //13 + sm3_round_12 14, 1, 2, 9,15, 5,12 //14 + sm3_round_12 15, 2, 3,10,16, 6,13 //15 + + ldr qTj,[const_adr,16] + sm3_round_16 16, 3, 4,11, 0, 7,14 //16 +#if 0 + stp 
sdig_A,sdig_B,[job0_digest] + stp sdig_C,sdig_D,[job0_digest,8] + stp sdig_E,sdig_F,[job0_digest,16] + stp sdig_G,sdig_H,[job0_digest,24] + b .exit_func +#endif + sm3_round_16 0, 4, 5,12, 1, 8,15 //17 + + sm3_round_16 1, 5, 6,13, 2, 9,16 //18 + sm3_round_16 2, 6, 7,14, 3,10, 0 //19 + sm3_round_16 3, 7, 8,15, 4,11, 1 //20 + sm3_round_16 4, 8, 9,16, 5,12, 2 //21 + sm3_round_16 5, 9,10, 0, 6,13, 3 //22 + sm3_round_16 6,10,11, 1, 7,14, 4 //23 + sm3_round_16 7,11,12, 2, 8,15, 5 //24 + sm3_round_16 8,12,13, 3, 9,16, 6 //25 + sm3_round_16 9,13,14, 4,10, 0, 7 //26 + sm3_round_16 10,14,15, 5,11, 1, 8 //27 + sm3_round_16 11,15,16, 6,12, 2, 9 //28 + sm3_round_16 12,16, 0, 7,13, 3,10 //29 + sm3_round_16 13, 0, 1, 8,14, 4,11 //30 + sm3_round_16 14, 1, 2, 9,15, 5,12 //31 + sm3_round_16 15, 2, 3,10,16, 6,13 //32 + sm3_round_16 16, 3, 4,11, 0, 7,14 //33 + sm3_round_16 0, 4, 5,12, 1, 8,15 //34 + sm3_round_16 1, 5, 6,13, 2, 9,16 //35 + sm3_round_16 2, 6, 7,14, 3,10, 0 //36 + sm3_round_16 3, 7, 8,15, 4,11, 1 //37 + sm3_round_16 4, 8, 9,16, 5,12, 2 //38 + sm3_round_16 5, 9,10, 0, 6,13, 3 //39 + sm3_round_16 6,10,11, 1, 7,14, 4 //40 + sm3_round_16 7,11,12, 2, 8,15, 5 //41 + sm3_round_16 8,12,13, 3, 9,16, 6 //42 + sm3_round_16 9,13,14, 4,10, 0, 7 //43 + sm3_round_16 10,14,15, 5,11, 1, 8 //44 + sm3_round_16 11,15,16, 6,12, 2, 9 //45 + sm3_round_16 12,16, 0, 7,13, 3,10 //46 + sm3_round_16 13, 0, 1, 8,14, 4,11 //47 + sm3_round_16 14, 1, 2, 9,15, 5,12 //48 + sm3_round_16 15, 2, 3,10,16, 6,13 //49 + sm3_round_16 16, 3, 4,11, 0, 7,14 //50 + sm3_round_16 0, 4, 5,12, 1, 8,15 //51 + sm3_round_16 1, 5, 6,13, 2, 9,16 //52 + sm3_round_16 2, 6, 7,14, 3,10, 0 //53 + sm3_round_16 3, 7, 8,15, 4,11, 1 //54 + sm3_round_16 4, 8, 9,16, 5,12, 2 //55 + sm3_round_16 5, 9,10, 0, 6,13, 3 //56 + sm3_round_16 6,10,11, 1, 7,14, 4 //57 + sm3_round_16 7,11,12, 2, 8,15, 5 //58 + sm3_round_16 8,12,13, 3, 9,16, 6 //59 + sm3_round_16 9,13,14, 4,10, 0, 7 //60 + sm3_round_16 10,14,15, 5,11, 1, 8 //61 + sm3_round_16 11,15,16, 6,12, 2, 9 //62 + sm3_round_63 12,16, 0, 7,13, 3,10 //63 + + subs len,len,1 + bne .start_loop + + //save digests with big endian + rev32 vdig_A.16b,vdig_A.16b + rev32 vdig_B.16b,vdig_B.16b + rev32 vdig_C.16b,vdig_C.16b + rev32 vdig_D.16b,vdig_D.16b + st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16 + rev32 vdig_E.16b,vdig_E.16b + rev32 vdig_F.16b,vdig_F.16b + st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16 + rev32 vdig_G.16b,vdig_G.16b + rev32 vdig_H.16b,vdig_H.16b + st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16 + st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16 + st4 {vdig_E.s-vdig_H.s}[0],[job0_digest] + st4 {vdig_E.s-vdig_H.s}[1],[job1_digest] + st4 {vdig_E.s-vdig_H.s}[2],[job2_digest] + st4 {vdig_E.s-vdig_H.s}[3],[job3_digest] + +.exit_func: + ldp d8, d9, [sp,16] + ldp d10,d11,[sp,32] + ldp d12,d13,[sp,48] + ldp d14,d15,[sp,64] + ldp x29, x30, [sp], STACK_SIZE + ret +.consts: + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c new file mode 100644 index 000000000..6e1dff45e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c @@ -0,0 +1,246 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +#define SM3_LOG2_BLOCK_SIZE 6 +void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state); +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_asimd(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_asimd(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + } + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_asimd(SM3_HASH_CTX_MGR * mgr) +{ + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_asimd(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop. + } +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SM3_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
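+		// HASH_LAST: hash_pad() writes the 0x80 marker, zero fill and the big-endian
+		// bit length into the partial block buffer and reports 1 or 2 blocks, which
+		// are then submitted as this job's final piece of work.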
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define cpu_to_be32(v) (((v&0xff000000)>>24) | ((v&0xff0000)>>8) | ((v&0xff00)<<8) | ((v&0xff)<<24)) +#else +#define cpu_to_be32(v) +#endif +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { cpu_to_be32(0x7380166f), cpu_to_be32(0x4914b2b9), + cpu_to_be32(0x172442d7), cpu_to_be32(0xda8a0600), + cpu_to_be32(0xa96f30bc), cpu_to_be32(0x163138aa), + cpu_to_be32(0xe38dee4d), cpu_to_be32(0xb0fb0e4e) + }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 + + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c new file mode 100644 index 000000000..5af9ead38 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c @@ -0,0 +1,241 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +#define SM3_LOG2_BLOCK_SIZE 6 +void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state); +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_sm(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_sm(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
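+	// Note the bitwise '|': a non-empty partial buffer or an input shorter than one
+	// block both route the data through the partial block buffer handling below.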
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job); + } + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_sm(SM3_HASH_CTX_MGR * mgr) +{ + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_sm(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop. + } +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SM3_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { to_be32(0x7380166f), to_be32(0x4914b2b9), + to_be32(0x172442d7), to_be32(0xda8a0600), + to_be32(0xa96f30bc), to_be32(0x163138aa), + to_be32(0xe38dee4d), to_be32(0xb0fb0e4e) + }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 + + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c new file mode 100644 index 000000000..48a0d4d0e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c @@ -0,0 +1,188 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES	4
+void sm3_mb_asimd_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+void sm3_mb_asimd_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) 	\
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) 	\
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i)	\
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i)	\
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state)
+{
+	unsigned int i;
+
+	state->unused_lanes = 0xf;
+	state->num_lanes_inuse = 0;
+	for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+		state->unused_lanes <<= 4;
+		state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+		state->lens[i] = i;
+		state->ldata[i].job_in_lane = 0;
+	}
+
+	//lanes > SM3_MB_CE_MAX_LANES is invalid lane
+	for (; i < SM3_MAX_LANES; i++) {
+		state->lens[i] = 0xf;
+		state->ldata[i].job_in_lane = 0;
+	}
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+	int lane_idx, len, i;
+
+	if (state->num_lanes_inuse == 0) {
+		return -1;
+	}
+	if (state->num_lanes_inuse == 4) {
+		len = min(min(state->lens[0], state->lens[1]),
+			  min(state->lens[2], state->lens[3]));
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+		sm3_mb_asimd_x4(state->ldata[0].job_in_lane,
+				state->ldata[1].job_in_lane,
+				state->ldata[2].job_in_lane,
+				state->ldata[3].job_in_lane, len >> 4);
+		//only return the min length job
+		for (i = 0; i < SM3_MAX_LANES; i++) {
+			if (LANE_IS_NOT_FINISHED(state, i)) {
+				state->lens[i] -= len;
+				state->ldata[i].job_in_lane->len -= len;
+				state->ldata[i].job_in_lane->buffer += len << 2;
+			}
+		}
+
+		return lane_idx;
+	} else {
+		for (i = 0; i < SM3_MAX_LANES; i++) {
+			if (LANE_IS_NOT_FINISHED(state, i)) {
+				len = state->lens[i] & (~0xf);
+				sm3_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+				state->lens[i] -= len;
+				state->ldata[i].job_in_lane->len -= len;
+				state->ldata[i].job_in_lane->buffer += len << 2;
+				return i;
+			}
+		}
+	}
+	return -1;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+	int i;
+	SM3_JOB *ret = NULL;
+
+	for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+		if (LANE_IS_FINISHED(state, i)) {
+
+			state->unused_lanes <<= 4;
+			state->unused_lanes |= i;
+			state->num_lanes_inuse--;
+			ret = state->ldata[i].job_in_lane;
+			ret->status = STS_COMPLETED;
+			state->ldata[i].job_in_lane = NULL;
+			break;
+		}
+	}
+	return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+	int lane_idx;
+	//add job into lanes
+	lane_idx = state->unused_lanes & 0xf;
+	//fatal error
+	assert(lane_idx < SM3_MB_CE_MAX_LANES);
+	state->lens[lane_idx] = (job->len << 4) | lane_idx;
+	state->ldata[lane_idx].job_in_lane = job;
+	state->unused_lanes >>= 4;
+	state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+	int lane_idx;
+#endif
+	SM3_JOB *ret;
+
+	//add job into lanes
+	sm3_mb_mgr_insert_job(state, job);
+
+	ret = sm3_mb_mgr_free_lane(state);
+	if (ret != NULL) {
+		return ret;
+	}
+	//submit will wait all lane has data
+	if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+		return NULL;
+#ifndef NDEBUG
+	lane_idx = 
sm3_mb_mgr_do_jobs(state);
+	assert(lane_idx != -1);
+#else
+	sm3_mb_mgr_do_jobs(state);
+#endif
+
+	//~ i = lane_idx;
+	ret = sm3_mb_mgr_free_lane(state);
+	return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state)
+{
+	SM3_JOB *ret;
+	ret = sm3_mb_mgr_free_lane(state);
+	if (ret) {
+		return ret;
+	}
+
+	sm3_mb_mgr_do_jobs(state);
+	return sm3_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
new file mode 100644
index 000000000..a7178e0be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+  Copyright(c) 2020 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SM3_MB_CE_MAX_LANES 4 +#if SM3_MB_CE_MAX_LANES >=4 +void sm3_mb_sm_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int); +#endif +#if SM3_MB_CE_MAX_LANES >=3 +void sm3_mb_sm_x3(SM3_JOB *, SM3_JOB *, SM3_JOB *, int); +#endif +#if SM3_MB_CE_MAX_LANES >=2 +void sm3_mb_sm_x2(SM3_JOB *, SM3_JOB *, int); +#endif +void sm3_mb_sm_x1(SM3_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SM3_MB_CE_MAX_LANES is invalid lane + for (; i < SM3_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SM3_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SM3_MB_CE_MAX_LANES == 4 + if (state->num_lanes_inuse == 4) { + len = min(min(state->lens[0], state->lens[1]), + min(state->lens[2], state->lens[3])); + lane_idx = len & 0xf; + len &= ~0xf; + + sm3_mb_sm_x4(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, + state->ldata[3].job_in_lane, len >> 4); + + } else +#elif SM3_MB_CE_MAX_LANES == 3 + if (state->num_lanes_inuse == 3) { + len = min(min(state->lens[0], state->lens[1]), state->lens[2]); + lane_idx = len & 0xf; + len &= ~0xf; + + sm3_mb_sm_x3(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, len >> 4); + + } else +#elif SM3_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + sm3_mb_sm_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SM3_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); +#if SM3_MB_CE_MAX_LANES >=4 + if (lanes == 4) { + sm3_mb_sm_x4(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, + state->ldata[lane_idx_array[3]].job_in_lane, len >> 4); + } else +#endif +#if SM3_MB_CE_MAX_LANES >=3 + if (lanes == 3) { + sm3_mb_sm_x3(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, len >> 4); + } else +#endif +#if SM3_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sm3_mb_sm_x2(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sm3_mb_sm_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < 
SM3_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + +} + +static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state) +{ + int i; + SM3_JOB *ret = NULL; + + for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SM3_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SM3_JOB *ret; + + //add job into lanes + sm3_mb_mgr_insert_job(state, job); + + ret = sm3_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sm3_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sm3_mb_mgr_do_jobs(state); +#endif + + ret = sm3_mb_mgr_free_lane(state); + return ret; +} + +SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state) +{ + SM3_JOB *ret; + ret = sm3_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sm3_mb_mgr_do_jobs(state); + return sm3_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S new file mode 100644 index 000000000..836bd9ccc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface sm3_ctx_mgr_submit +mbin_interface sm3_ctx_mgr_init +mbin_interface sm3_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S new file mode 100644 index 000000000..f92ac5e9f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S @@ -0,0 +1,237 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + ext v\msg4\().16b, v\msg1\().16b, v\msg2\().16b, #12 + ext v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b, #12 + ext v\tmp1\().16b, v\msg2\().16b, v\msg3\().16b, #8 + sm3partw1 v\msg4\().4s, v\msg0\().4s, v\msg3\().4s + sm3partw2 v\msg4\().4s, v\tmp1\().4s, v\tmp0\().4s + +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req + eor v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b + + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[0] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[0] + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[1] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[1] + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[2] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[2] + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[3] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[3] + +.endm + +.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + job .req x0 + len .req x1 + data .req x2 + digest .req x0 + end_ptr .req x1 + + + declare_var_vector_reg msg0,0 + declare_var_vector_reg msg1,1 + declare_var_vector_reg msg2,2 + declare_var_vector_reg msg3,3 + declare_var_vector_reg msg4,4 + declare_var_vector_reg dig0,5 + declare_var_vector_reg dig1,6 + declare_var_vector_reg backup_dig0, 7 + + declare_var_vector_reg tmp0,16 + declare_var_vector_reg tmp1,17 + declare_var_vector_reg backup_dig1, 18 + + declare_var_vector_reg const0,19 + declare_var_vector_reg const1,20 + declare_var_vector_reg const2,21 + declare_var_vector_reg const3,22 + declare_var_vector_reg const4,23 + declare_var_vector_reg const5,24 + declare_var_vector_reg const6,25 + declare_var_vector_reg const7,26 + declare_var_vector_reg const8,27 + declare_var_vector_reg const9,28 + declare_var_vector_reg const10,29 + declare_var_vector_reg const11,30 + + + + + .global sm3_mb_sm_x1 + .type sm3_mb_sm_x1, %function +sm3_mb_sm_x1: + adrp x3,.consts + ldr data, [job],64 + add x3,x3,:lo12:.consts + ldp qdig0,qdig1,[digest] + ld1 {vconst0.16b-vconst3.16b},[x3],64 + add end_ptr,data,len,lsl 6 + ld1 {vconst4.16b-vconst7.16b},[x3],64 + //rev128 + ext vdig0.16b,vdig0.16b,vdig0.16b,#8 + ext vdig1.16b,vdig1.16b,vdig1.16b,#8 + ld1 {vconst8.16b-vconst11.16b},[x3],64 + rev64 vdig0.16b,vdig0.16b + rev64 vdig1.16b,vdig1.16b + + +start_loop: + mov vbackup_dig0.16b,vdig0.16b + mov vbackup_dig1.16b,vdig1.16b + ldp qmsg0,qmsg1,[data],32 + ldp qmsg2,qmsg3,[data],32 + + // big-endian to little-endian + rev32 vmsg0.16b,vmsg0.16b + rev32 vmsg1.16b,vmsg1.16b + rev32 
vmsg2.16b,vmsg2.16b + rev32 vmsg3.16b,vmsg3.16b + + quad_round_expand a, const0, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand a, const1, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand a, const2, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const4, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const6, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const7, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand b, const8, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const4, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + cmp data,end_ptr + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1 + + eor vdig0.16b,vdig0.16b,vbackup_dig0.16b + eor vdig1.16b,vdig1.16b,vbackup_dig1.16b + + + bcc start_loop + + //rev128 + ext vdig0.16b,vdig0.16b,vdig0.16b,#8 + ext vdig1.16b,vdig1.16b,vdig1.16b,#8 + rev64 vdig0.16b,vdig0.16b + rev64 vdig1.16b,vdig1.16b + str qdig0,[digest] + str qdig1,[digest,16] + ret + dsb ish + isb + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x1, .-sm3_mb_sm_x1 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S new file mode 100644 index 000000000..4e4a6e738 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S @@ -0,0 +1,344 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 v\job\()_\msg4\().4s, v\job\()_\msg0\().4s, v\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 v\job\()_\msg4\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1 + do_ext job\j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1 + do_ext job\j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1 + do_ext job\j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1 + do_sm3partw1 job\j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1 + do_sm3partw2 job\j,\msg4, \tmp1, \tmp0 + .endr + +.endm + +.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req + .irp j,0,1 + do_eor job\j,\tmp0,\msg0,\msg1 + .endr + .irp lane,0,1,2,3 + .irp j,0,1 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + .irp j,0,1 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + .endr +.endm + +.macro quad_round_expand 
ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + +/* + Variables +*/ + job0 .req x0 + job1 .req x1 + len .req x2 + + job0_data .req x3 + job1_data .req x4 + job0_digest .req x0 + job1_digest .req x1 + + const_adr .req x5 + end_ptr .req x2 + + declare_var_vector_reg job0_msg0, 0 + declare_var_vector_reg job0_msg1, 1 + declare_var_vector_reg job0_msg2, 2 + declare_var_vector_reg job0_msg3, 3 + declare_var_vector_reg job0_msg4, 4 + declare_var_vector_reg job0_dig0, 5 + declare_var_vector_reg job0_dig1, 6 + declare_var_vector_reg job0_tmp0, 7 + declare_var_vector_reg job0_tmp1, 8 + declare_var_vector_reg job0_backup_dig0, 9 + declare_var_vector_reg job0_backup_dig1, 10 + + declare_var_vector_reg job1_msg0, 11 + declare_var_vector_reg job1_msg1, 12 + declare_var_vector_reg job1_msg2, 13 + declare_var_vector_reg job1_msg3, 14 + declare_var_vector_reg job1_msg4, 15 + declare_var_vector_reg job1_dig0, 16 + declare_var_vector_reg job1_dig1, 17 + declare_var_vector_reg job1_tmp0, 18 + declare_var_vector_reg job1_tmp1, 19 + declare_var_vector_reg job1_backup_dig0, 20 + declare_var_vector_reg job1_backup_dig1, 21 + + declare_var_vector_reg const0, 22 + declare_var_vector_reg const1, 23 + declare_var_vector_reg const2, 24 + declare_var_vector_reg const3, 25 + declare_var_vector_reg const4, 26 + declare_var_vector_reg const5, 27 + declare_var_vector_reg const6, 28 + declare_var_vector_reg const7, 29 + declare_var_vector_reg const8, 30 + declare_var_vector_reg const9, 31 + declare_var_vector_reg const10, 22 + declare_var_vector_reg const11, 23 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm +.macro rev32_msgs + .irp j,0,1 + do_rev32_job job\j + .endr +.endm + + + .global sm3_mb_sm_x2 + .type sm3_mb_sm_x2, %function +sm3_mb_sm_x2: + //push d8~d15 + stp d8,d9,[sp,-192]! 
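+	// AAPCS64 treats d8-d15 (the low halves of v8-v15) as callee-saved; the
+	// two-lane kernel uses those registers for lane state, so they are spilled
+	// here and restored at exit_ret before returning.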
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + adrp const_adr,.consts + ldr job0_data, [job0],64 + add const_adr,const_adr,:lo12:.consts + ldr job1_data, [job1],64 + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + + ldp qconst2,qconst3,[const_adr,32] + ldp qconst4,qconst5,[const_adr,64] + ldp qconst6,qconst7,[const_adr,96] + ldp qconst8,qconst9,[const_adr,128] + add end_ptr,job0_data,len,lsl 6 + + //rev128 + ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8 + ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8 + rev64 vjob0_dig0.16b,vjob0_dig0.16b + rev64 vjob0_dig1.16b,vjob0_dig1.16b + ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8 + ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8 + rev64 vjob1_dig0.16b,vjob1_dig0.16b + rev64 vjob1_dig1.16b,vjob1_dig1.16b + + + + + +start_loop: + + ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64 + ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64 + + mov vjob0_backup_dig0.16b,vjob0_dig0.16b + mov vjob0_backup_dig1.16b,vjob0_dig1.16b + mov vjob1_backup_dig0.16b,vjob1_dig0.16b + mov vjob1_backup_dig1.16b,vjob1_dig1.16b + + // const10,const11,const0,const1 share registers + ldp qconst0,qconst1,[const_adr] + + // big-endian to little-endian + rev32_msgs + + cmp job0_data,end_ptr + quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + + + quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + // const10,const11,const0,const1 share registers + ldp qconst10,qconst11,[const_adr,160] + quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1 + + eor vjob0_dig0.16b,vjob0_dig0.16b,vjob0_backup_dig0.16b + eor vjob0_dig1.16b,vjob0_dig1.16b,vjob0_backup_dig1.16b + eor vjob1_dig0.16b,vjob1_dig0.16b,vjob1_backup_dig0.16b + eor vjob1_dig1.16b,vjob1_dig1.16b,vjob1_backup_dig1.16b + + + bcc start_loop + + //rev128 + ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8 + ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8 + rev64 vjob0_dig0.16b,vjob0_dig0.16b + rev64 vjob0_dig1.16b,vjob0_dig1.16b + stp qjob0_dig0,qjob0_dig1,[job0_digest] + + ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8 + ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8 + rev64 vjob1_dig0.16b,vjob1_dig0.16b + rev64 vjob1_dig1.16b,vjob1_dig1.16b + stp qjob1_dig0,qjob1_dig1,[job1_digest] + +#if 1 + mov v0.16b,vjob1_dig0.16b + mov v1.16b,vjob1_dig1.16b + b exit_ret +#endif + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + 
ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x2, .-sm3_mb_sm_x2 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S new file mode 100644 index 000000000..58758f98d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S @@ -0,0 +1,368 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + //dsdf + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1,2 + do_ext \j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1,2 + do_ext \j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1,2 + do_ext \j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1,2 + do_sm3partw1 \j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1,2 + do_sm3partw2 \j,\msg4, \tmp1, \tmp0 + .endr + +.endm + +.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm +.macro do_ld_backup_digest job + ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off] +.endm + +.macro do_st_digest job + stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest] +.endm +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,load_digest + .irp j,0,1,2 + do_eor job\j,\tmp0,\msg0,\msg1 + .ifnb \load_digest + do_ld_backup_digest \j + .endif + .endr + .irp lane,0,1,2,3 + .irp j,0,1,2 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1,2 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + .irp j,0,1,2 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + + .endr +.endm + +.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + +/* + Variables +*/ + job0 .req x0 + job1 .req x1 + job2 .req x2 + len .req x3 + + job0_data .req x4 + job1_data .req x5 + job2_data .req x6 + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + + const_adr .req x7 + end_ptr .req x3 + + declare_var_vector_reg job0_msg0, 0 + declare_var_vector_reg job0_msg1, 1 + declare_var_vector_reg job0_msg2, 2 + declare_var_vector_reg job0_msg3, 3 + declare_var_vector_reg job0_msg4, 4 + declare_var_vector_reg job0_dig0, 5 + declare_var_vector_reg job0_dig1, 6 + declare_var_vector_reg job0_tmp0, 7 + declare_var_vector_reg job0_tmp1, 8 + .set job0_dig_off, 64 + declare_var_vector_reg job0_backup_dig0, 2 + declare_var_vector_reg job0_backup_dig1, 3 + + declare_var_vector_reg job1_msg0, 9 + declare_var_vector_reg job1_msg1, 10 + declare_var_vector_reg job1_msg2, 11 + declare_var_vector_reg job1_msg3, 12 + declare_var_vector_reg job1_msg4, 13 + declare_var_vector_reg job1_dig0, 14 + declare_var_vector_reg job1_dig1, 15 + 
declare_var_vector_reg job1_tmp0, 16 + declare_var_vector_reg job1_tmp1, 17 + .set job1_dig_off, 96 + declare_var_vector_reg job1_backup_dig0, 11 + declare_var_vector_reg job1_backup_dig1, 12 + + declare_var_vector_reg job2_msg0, 18 + declare_var_vector_reg job2_msg1, 19 + declare_var_vector_reg job2_msg2, 20 + declare_var_vector_reg job2_msg3, 21 + declare_var_vector_reg job2_msg4, 22 + declare_var_vector_reg job2_dig0, 23 + declare_var_vector_reg job2_dig1, 24 + declare_var_vector_reg job2_tmp0, 25 + declare_var_vector_reg job2_tmp1, 26 + .set job2_dig_off, 128 + declare_var_vector_reg job2_backup_dig0, 20 + declare_var_vector_reg job2_backup_dig1, 21 + + + declare_var_vector_reg const0, 27 + declare_var_vector_reg const1, 28 + declare_var_vector_reg const2, 29 + declare_var_vector_reg const3, 30 + declare_var_vector_reg const4, 27 + declare_var_vector_reg const5, 28 + declare_var_vector_reg const6, 29 + declare_var_vector_reg const7, 30 + declare_var_vector_reg const8, 27 + declare_var_vector_reg const9, 28 + declare_var_vector_reg const10, 29 + declare_var_vector_reg const11, 30 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm +.macro rev32_msgs + .irp j,0,1,2 + do_rev32_job job\j + .endr +.endm + +.macro do_rev64 job,regd,regn + rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b +.endm + + .global sm3_mb_sm_x3 + .type sm3_mb_sm_x3, %function +sm3_mb_sm_x3: + //push d8~d15 + stp d8,d9,[sp,-192]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + adrp const_adr,.consts + ldr job0_data, [job0],64 + add const_adr,const_adr,:lo12:.consts + ldr job1_data, [job1],64 + ldr job2_data, [job2],64 + + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + ldp qjob2_dig0,qjob2_dig1,[job2_digest] + ld1 {vconst0.16b-vconst3.16b},[const_adr] + add end_ptr,job0_data,len,lsl 6 + + //rev128 + .irp j,0,1,2 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + .endr + + + + + +start_loop: + + ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64 + stp qjob0_dig0,qjob0_dig1,[sp,job0_dig_off] + ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64 + stp qjob1_dig0,qjob1_dig1,[sp,job1_dig_off] + ld1 {vjob2_msg0.16b-vjob2_msg3.16b},[job2_data],64 + stp qjob2_dig0,qjob2_dig1,[sp,job2_dig_off] + + cmp job0_data,end_ptr + + // big-endian to little-endian + rev32_msgs + + quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + + ldp qconst4,qconst5,[const_adr,4*16] + quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + + ldp qconst6,qconst7,[const_adr,6*16] + quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + ldp qconst8,qconst9,[const_adr,8*16] + quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + ldp qconst10,qconst11,[const_adr,10*16] + quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + + ldp 
qconst4,qconst5,[const_adr,4*16] + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + ldp qconst6,qconst7,[const_adr,6*16] + quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + ldp qconst0,qconst1,[const_adr] + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1,1 + ldp qconst2,qconst3,[const_adr,2*16] + + .irp j,0,1,2 + do_eor job\j,dig0,dig0,backup_dig0 + do_eor job\j,dig1,dig1,backup_dig1 + .endr + + bcc start_loop + + //rev128 + .irp j,0,1,2 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + do_st_digest \j + .endr + + + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x3, .-sm3_mb_sm_x3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S new file mode 100644 index 000000000..7f3f1db66 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S @@ -0,0 +1,440 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1,2,3 + do_ext \j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1,2,3 + do_ext \j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1,2,3 + do_ext \j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1,2,3 + do_sm3partw1 \j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1,2,3 + do_sm3partw2 \j,\msg4, \tmp1, \tmp0 + .endr + st1 {vjob0_\msg4\().16b-vjob3_\msg4\().16b},[data_buf],64 +.endm + +.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm +.macro do_ld_backup_digest job + ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off] +.endm + +.macro do_st_digest job + stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest] +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,is_last + .ifnb \is_last + ld1 {vjob0_backup_dig0.16b-vjob3_backup_dig0.16b},[dig_buf],64 + .endif + + .irp j,0,1,2,3 + do_eor job\j,\tmp0,\msg0,\msg1 + + .endr + + .irp lane,0,1,2 + .irp j,0,1,2,3 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1,2,3 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + .irp j,0,1,2,3 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + + + .endr + .irp j,0,1,2,3 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + .ifnb \is_last + + ld1 {vjob0_backup_dig1.16b-vjob3_backup_dig1.16b},[dig_buf] + .else + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .endif + .irp j,0,1,2,3 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,3 + .endr + + .irp j,0,1,2,3 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,3 + .ifnb \is_last + 
do_eor job\j,dig1,dig1,backup_dig1 + do_eor job\j,dig0,dig0,backup_dig0 + .endif + .endr + + .ifb \is_last + ld1 {vjob0_\msg0\().16b-vjob3_\msg0\().16b},[data_buf],64 + .endif + +.endm + + + +/* + Variables +*/ + .set temp_buf_size,(68*4+32)*4 + .set dig_buf_off,64 + .set data_buf_off,64+32*4 + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + len .req x4 + + job0_data .req x5 + job1_data .req x6 + job2_data .req x7 + job3_data .req x9 + + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + job3_digest .req x3 + + const_adr .req x10 + end_ptr .req x4 + data_buf .req x11 + dig_buf .req x12 + + declare_var_vector_reg job0_msg0, 0 + declare_var_vector_reg job1_msg0, 1 + declare_var_vector_reg job2_msg0, 2 + declare_var_vector_reg job3_msg0, 3 + declare_var_vector_reg job0_msg1, 4 + declare_var_vector_reg job1_msg1, 5 + declare_var_vector_reg job2_msg1, 6 + declare_var_vector_reg job3_msg1, 7 + declare_var_vector_reg job0_msg2, 8 + declare_var_vector_reg job1_msg2, 9 + declare_var_vector_reg job2_msg2, 10 + declare_var_vector_reg job3_msg2, 11 + declare_var_vector_reg job0_msg3, 12 + declare_var_vector_reg job1_msg3, 13 + declare_var_vector_reg job2_msg3, 14 + declare_var_vector_reg job3_msg3, 15 + declare_var_vector_reg job0_tmp0, 16 + declare_var_vector_reg job1_tmp0, 17 + declare_var_vector_reg job2_tmp0, 18 + declare_var_vector_reg job3_tmp0, 19 + declare_var_vector_reg job0_tmp1, 20 + declare_var_vector_reg job1_tmp1, 21 + declare_var_vector_reg job2_tmp1, 22 + declare_var_vector_reg job3_tmp1, 23 + declare_var_vector_reg job0_msg4, 24 + declare_var_vector_reg job1_msg4, 25 + declare_var_vector_reg job2_msg4, 26 + declare_var_vector_reg job3_msg4, 27 + declare_var_vector_reg job0_dig0, 8 + declare_var_vector_reg job1_dig0, 9 + declare_var_vector_reg job2_dig0, 10 + declare_var_vector_reg job3_dig0, 11 + declare_var_vector_reg job0_dig1, 12 + declare_var_vector_reg job1_dig1, 13 + declare_var_vector_reg job2_dig1, 14 + declare_var_vector_reg job3_dig1, 15 + + declare_var_vector_reg job0_backup_dig0, 24 + declare_var_vector_reg job1_backup_dig0, 25 + declare_var_vector_reg job2_backup_dig0, 26 + declare_var_vector_reg job3_backup_dig0, 27 + declare_var_vector_reg job0_backup_dig1, 28 + declare_var_vector_reg job1_backup_dig1, 29 + declare_var_vector_reg job2_backup_dig1, 30 + declare_var_vector_reg job3_backup_dig1, 31 + + declare_var_vector_reg const0, 24 + declare_var_vector_reg const1, 25 + declare_var_vector_reg const2, 26 + declare_var_vector_reg const3, 27 + declare_var_vector_reg const4, 28 + declare_var_vector_reg const5, 29 + declare_var_vector_reg const6, 30 + declare_var_vector_reg const7, 31 + declare_var_vector_reg const8, 24 + declare_var_vector_reg const9, 25 + declare_var_vector_reg const10, 26 + declare_var_vector_reg const11, 27 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm + +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm + +.macro rev32_msgs + .irp j,0,1,2,3 + do_rev32_job job\j + .endr +.endm + +.macro do_rev64 job,regd,regn + rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b +.endm + +.macro do_ldp_msg23 job + ldp qjob\job\()_msg2,qjob\job\()_msg3,[job\job\()_data],32 +.endm + + .global sm3_mb_sm_x4 + .type sm3_mb_sm_x4, %function +sm3_mb_sm_x4: + //push d8~d15 + sub sp,sp,temp_buf_size + stp d8,d9,[sp,-64]! 
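+	// Four lanes do not all fit in the vector register file the way the
+	// x1/x2/x3 kernels do, so each outer iteration stages its working set on
+	// the stack: dig_buf (sp + dig_buf_off) keeps a copy of the four digests
+	// and data_buf (sp + data_buf_off) receives the expanded message schedule,
+	// which the rounds below stream back in with ld1.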
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + + ldr job0_data, [job0],64 + ldr job1_data, [job1],64 + ldr job2_data, [job2],64 + ldr job3_data, [job3],64 + + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + ldp qjob2_dig0,qjob2_dig1,[job2_digest] + ldp qjob3_dig0,qjob3_dig1,[job3_digest] + add end_ptr,job0_data,len,lsl 6 + //rev128,change digest endian + .irp j,0,1,2,3 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + .endr + + + + +start_loop: + add dig_buf,sp,dig_buf_off + ldp qjob0_msg0,qjob0_msg1,[job0_data],32 + add data_buf,sp,data_buf_off + ldp qjob1_msg0,qjob1_msg1,[job1_data],32 + st1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64 + ldp qjob2_msg0,qjob2_msg1,[job2_data],32 + st1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf] + ldp qjob3_msg0,qjob3_msg1,[job3_data],32 + + .irp j,0,1,2,3 + do_ldp_msg23 \j + do_rev32_msg job\j,msg0 + do_rev32_msg job\j,msg1 + .endr + st1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64 + st1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64 + .irp j,0,1,2,3 + do_rev32_msg job\j,msg2 + do_rev32_msg job\j,msg3 + .endr + st1 {vjob0_msg2.16b-vjob3_msg2.16b},[data_buf],64 + st1 {vjob0_msg3.16b-vjob3_msg3.16b},[data_buf],64 + + cmp job0_data,end_ptr + + /** message expand **/ + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + /** re-init variables for sm3 rounds **/ + add dig_buf,sp,dig_buf_off + ld1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64 + add data_buf,sp,data_buf_off + ld1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf] + add dig_buf,sp,dig_buf_off + adrp const_adr,.consts + ld1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64 + add const_adr,const_adr,:lo12:.consts + ld1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64 + ld1 {vconst0.16b-vconst3.16b},[const_adr],64 + ld1 {vconst4.16b-vconst7.16b},[const_adr],64 + /** digests rounds **/ + quad_round a, const0 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round a, const1 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round a, const2 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round a, const3 , dig0, dig1, msg1, msg0, tmp0, tmp1 + + /** share registers with vconst0-vconst3 **/ + ld1 {vconst8.16b-vconst11.16b},[const_adr] + + quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const8 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const9 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const10, dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const11, dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1 + 
quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1,1 + + bcc start_loop + + //rev128 + .irp j,0,1,2,3 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + do_st_digest \j + .endr + + + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 64 + add sp,sp,temp_buf_size + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x4, .-sm3_mb_sm_x4 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c new file mode 100644 index 000000000..b1c6ee26b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c @@ -0,0 +1,284 @@ +/********************************************************************** + Copyright(c) 2011-2020 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR * state); + +void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SM3_X8_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} + +void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_avx2(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_avx2(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
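+	// The OR below is bitwise (| rather than ||): the expression is non-zero
+	// when the partial block buffer already holds data or when the incoming
+	// submit is shorter than one 64-byte SM3 block; in either case the bytes
+	// are staged through the partial block buffer before hitting the lanes.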
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + } + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_avx2(SM3_HASH_CTX_MGR * mgr) +{ + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop. + } +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + unsigned int j; + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + ctx->job.result_digest[j] = + byteswap32(ctx->job.result_digest[j]); + } + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SM3_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
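+		// For the LAST submit, hash_pad() finishes the message in place: it
+		// appends the 0x80 byte, zero padding and the big-endian bit count to
+		// the partial block buffer and returns how many extra 64-byte blocks
+		// (one or two) remain to be hashed for this context.
+		// Worked example, assuming the usual 8-byte SM3 length field: a
+		// 130-byte message leaves 2 buffered bytes, so 0x80 lands at offset 2,
+		// the value 130*8 = 1040 is stored big-endian at offset 56, and
+		// hash_pad() reports a single extra block.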
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { SM3_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + + 1 + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +struct slver sm3_ctx_mgr_init_avx2_slver_0000; +struct slver sm3_ctx_mgr_init_avx2_slver = { 0x2309, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_submit_avx2_slver_0000; +struct slver sm3_ctx_mgr_submit_avx2_slver = { 0x230a, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_flush_avx2_slver_0000; +struct slver sm3_ctx_mgr_flush_avx2_slver = { 0x230b, 0x00, 0x00 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c new file mode 100644 index 000000000..8169aa170 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c @@ -0,0 +1,292 @@ +/********************************************************************** + Copyright(c) 2011-2020 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_avx512(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR * state); + +void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SM3_MAX_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} + +void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_avx512(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_avx512(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // if partial_block_buffer_length != 0 means ctx get extra data + // len < SM3_BLOCK_SIZE means data len < SM3_BLOCK_SIZE + if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + + ctx->partial_block_buffer_length = 0; + ctx->job.buffer = ctx->partial_block_buffer; + + ctx->job.len = 1; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + } + + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + unsigned int j; + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + ctx->job.result_digest[j] = + byteswap32(ctx->job.result_digest[j]); + } + return ctx; + } + // partial_block_buffer_length must be 0 that means incoming_buffer_length have not be init. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // copy_len will check len % SM3_BLOCK_SIZE ?= 0 + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + // if mod SM3_BLOCK_SIZE != 0 + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + // store the extra data + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + // after len -= copy_len or copy_len == 0 + assert((len % SM3_BLOCK_SIZE) == 0); + // get the block size , eq len = len / 64 + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + // todo make sure should return ? 
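+			// The submit above may return this context, a different context
+			// whose job happened to complete, or NULL; looping lets the
+			// COMPLETE check at the top of the while handle whichever of
+			// those comes back.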
+ continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + + 1 + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_avx512(SM3_HASH_CTX_MGR * mgr) +{ + + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } + +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { SM3_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +struct slver sm3_ctx_mgr_init_avx512_slver_0000; +struct slver sm3_ctx_mgr_init_avx512_slver = { 0x2306, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_submit_avx512_slver_0000; +struct slver sm3_ctx_mgr_submit_avx512_slver = { 0x2307, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_flush_avx512_slver_0000; +struct slver sm3_ctx_mgr_flush_avx512_slver = { 0x2308, 0x00, 0x00 }; + +#endif // HAVE_AS_KNOWS_AVX512 + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c new file mode 100644 index 000000000..e8fcfe08a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c @@ -0,0 +1,314 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define rol32(x, r) (((x)<<(r)) | ((x)>>(32-(r)))) + +static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t OPT_FIX sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void OPT_FIX sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX sm3_single(const volatile void *data, uint32_t digest[]); +static inline void hash_init_digest(SM3_WORD_T * digest); + +static inline uint32_t P0(uint32_t X) +{ + return (X ^ (rol32(X, 9)) ^ (rol32(X, 17))); +} + +static inline uint32_t P1(uint32_t X) +{ + return (X ^ (rol32(X, 15)) ^ (rol32(X, 23))); +} + +static inline uint32_t sm3_ff(int j, uint32_t x, uint32_t y, uint32_t z) +{ + return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z)); +} + +static inline uint32_t sm3_gg(int j, uint32_t x, uint32_t y, uint32_t z) +{ + return j < 16 ? (x ^ y ^ z) : ((x & y) | ((~x) & z)); +} + +static inline void sm3_message_schedule(uint32_t bi[], volatile uint32_t W[], + volatile uint32_t W_B[]) +{ + int j; + volatile uint32_t tmp; + + for (j = 0; j <= 15; j++) { + W[j] = to_be32(bi[j]); + } + + for (; j <= 67; j++) { + tmp = W[j - 16] ^ W[j - 9] ^ rol32(W[j - 3], 15); + W[j] = P1(tmp) ^ (rol32(W[j - 13], 7)) ^ W[j - 6]; + } + + for (j = 0; j < 64; j++) { + W_B[j] = W[j] ^ W[j + 4]; + } + + tmp = 0; +} + +static inline void sm3_compress_step_func(int j, volatile uint32_t * a_p, + volatile uint32_t * b_p, volatile uint32_t * c_p, + volatile uint32_t * d_p, volatile uint32_t * e_p, + volatile uint32_t * f_p, volatile uint32_t * g_p, + volatile uint32_t * h_p, volatile uint32_t W[], + volatile uint32_t W_B[]) +{ + volatile uint32_t SS1, SS2, TT1, TT2; + uint32_t T = j < 16 ? 0x79cc4519 : 0x7a879d8a; + + SS1 = rol32(rol32(*a_p, 12) + *e_p + rol32(T, (j % 32)), 7); + SS2 = SS1 ^ rol32(*a_p, 12); + TT1 = sm3_ff(j, *a_p, *b_p, *c_p) + *d_p + SS2 + W_B[j]; + TT2 = sm3_gg(j, *e_p, *f_p, *g_p) + *h_p + SS1 + W[j]; + *d_p = *c_p; + *c_p = rol32(*b_p, 9); + *b_p = *a_p; + *a_p = TT1; + *h_p = *g_p; + *g_p = rol32(*f_p, 19); + *f_p = *e_p; + *e_p = P0(TT2); + + SS1 = 0; + SS2 = 0; + TT1 = 0; + TT2 = 0; +} + +void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr) +{ +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. 
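+		// In this serial base path PROCESSING stays set between FIRST/UPDATE
+		// calls, so only a fresh ENTIRE job can collide with work already in
+		// flight, and that is what is rejected here.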
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + if (len % SM3_BLOCK_SIZE != 0) { + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + sm3_init(ctx, buffer, len); + sm3_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + if (len % SM3_BLOCK_SIZE != 0) { + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + sm3_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sm3_update(ctx, buffer, len); + sm3_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sm3_init(ctx, buffer, len); + remain_len = sm3_update(ctx, buffer, len); + sm3_final(ctx, remain_len); + } + + return ctx; +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + + while (remain_len >= SM3_BLOCK_SIZE) { + sm3_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SM3_BLOCK_SIZE); + remain_len -= SM3_BLOCK_SIZE; + ctx->total_length += SM3_BLOCK_SIZE; + } + + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len; + uint32_t j; + volatile uint8_t buf[2 * SM3_BLOCK_SIZE] = { 0 }; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy((void *)buf, buffer, i); + buf[i++] = 0x80; + + i = (i > SM3_BLOCK_SIZE - SM3_PADLENGTHFIELD_SIZE ? 
+ 2 * SM3_BLOCK_SIZE : SM3_BLOCK_SIZE); + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sm3_single(buf, digest); + if (i == 2 * SM3_BLOCK_SIZE) { + sm3_single(buf + SM3_BLOCK_SIZE, digest); + } + + /* convert to small-endian for words */ + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + digest[j] = byteswap32(digest[j]); + } + + ctx->status = HASH_CTX_STS_COMPLETE; + memset((void *)buf, 0, sizeof(buf)); +} + +static void sm3_single(const volatile void *data, uint32_t digest[]) +{ + volatile uint32_t a, b, c, d, e, f, g, h; + volatile uint32_t W[68], W_bar[64]; + int j; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + sm3_message_schedule((uint32_t *) data, W, W_bar); + for (j = 0; j < 64; j++) { + sm3_compress_step_func(j, &a, &b, &c, &d, &e, &f, &g, &h, W, W_bar); + } + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; + + memset((void *)W, 0, sizeof(W)); + memset((void *)W_bar, 0, sizeof(W_bar)); + + a = 0; + b = 0; + c = 0; + d = 0; + e = 0; + f = 0; + g = 0; + h = 0; +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { SM3_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sm3_ctx_mgr_init_base_slver_0000; +struct slver sm3_ctx_mgr_init_base_slver = { 0x2303, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_submit_base_slver_0000; +struct slver sm3_ctx_mgr_submit_base_slver = { 0x2304, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_flush_base_slver_0000; +struct slver sm3_ctx_mgr_flush_base_slver = { 0x2305, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c new file mode 100644 index 000000000..d74a4c882 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include +#include "sm3_mb.h" +#include "memcpy_inline.h" + +extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr); +extern SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags); +extern SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr); + +void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr) +{ + return sm3_ctx_mgr_init_base(mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sm3_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr) +{ + return sm3_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm new file mode 100644 index 000000000..0f2a0f39a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm @@ -0,0 +1,65 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Threshold constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; if number of lanes in use <= threshold, using sb func +%define SM3_SB_THRESHOLD_SSE 1 +%define SM3_SB_THRESHOLD_AVX 1 +%define SM3_SB_THRESHOLD_AVX2 1 +%define SM3_SB_THRESHOLD_AVX512 1 +%define SM3_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb +%define SM3_NI_SB_THRESHOLD_AVX512 6 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA256_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA256_JOB + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 8, 8 ; length in bytes +FIELD _result_digest, 8*4, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 + +%assign _SM3_JOB_size _FIELD_OFFSET +%assign _SM3_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c new file mode 100644 index 000000000..fbbb2a1a7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS (SM3_MAX_LANES - 1) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +// Compare against reference function +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +uint8_t lens_print_and_check(SM3_HASH_CTX_MGR * mgr) +{ + static int32_t last_lens[SM3_MAX_LANES] = { 0 }; + int32_t len; + uint8_t num_unchanged = 0; + int i; + for (i = 0; i < SM3_MAX_LANES; i++) { + len = (int32_t) mgr->mgr.lens[i]; + // len[i] in mgr consists of byte_length<<4 | lane_index + len = (len >= 16) ? (len >> 4 << 6) : 0; + printf("\t%d", len); + if (last_lens[i] > 0 && last_lens[i] == len) + num_unchanged += 1; + last_lens[i] = len; + } + printf("\n"); + return num_unchanged; +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + uint8_t num_ret, num_unchanged = 0; + int ret; + + printf("sm3_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + lens[i] = TEST_LEN / SM3_MAX_LANES * (i + 1); + bufs[i] = (unsigned char *)malloc(lens[i]); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], lens[i]); + } + + for (i = 0; i < TEST_BUFS; i++) { + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + printf("Changes of lens inside mgr:\n"); + lens_print_and_check(mgr); + while (sm3_ctx_mgr_flush(mgr)) { + num_ret = lens_print_and_check(mgr); + num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret; + } + printf("Info of sm3_mb lens prints over\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf("Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm new file mode 100644 index 000000000..a2319ba14 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm @@ -0,0 +1,77 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SM3 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SM3_ARGS_X16 +;;; name size align +FIELD _digest, 4*8*16, 4 ; transposed digest +FIELD _data_ptr, 8*16, 8 ; array of pointers to data +END_FIELDS + +%assign _SM3_ARGS_X4_size _FIELD_OFFSET +%assign _SM3_ARGS_X4_align _STRUCT_ALIGN +%assign _SM3_ARGS_X8_size _FIELD_OFFSET +%assign _SM3_ARGS_X8_align _STRUCT_ALIGN +%assign _SM3_ARGS_X16_size _FIELD_OFFSET +%assign _SM3_ARGS_X16_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SM3_ARGS_X4_size, _SM3_ARGS_X4_align +FIELD _lens, 4*16, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..b87bdcba8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm @@ -0,0 +1,258 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "sm3_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sm3_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sm3_mb_x8_avx2 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SM3_JOB* sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sm3_mb_mgr_flush_avx2, function +sm3_mb_mgr_flush_avx2: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 
16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + +mb_processing: + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x8_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + 
_XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..7feada49f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm @@ -0,0 +1,276 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + + + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sm3_mb_x16_avx512 +;extern sm3_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg1 rdi ; rcx + %define arg2 rsi ; rdx + %define tmp4 rdx +%else + %define arg1 rcx + %define arg2 rdx + %define tmp4 rsi +%endif + + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define idx rbp + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + + +; SM3_JOB* sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sm3_mb_mgr_flush_avx512, function +sm3_mb_mgr_flush_avx512: + endbranch + + ; Save the stack + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; flush may check here and call x1 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 
0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x16_avx512 + ; state and idx are intact + + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + + +; return back stack +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_mgr_flush_avx512 +no_sm3_mb_mgr_flush_avx512: +%endif + +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..ae95faa89 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm @@ -0,0 +1,247 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "memcpy.asm" +%include "sm3_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sm3_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SM3_JOB* sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR *state, SM3_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sm3_mb_mgr_submit_avx2, function +sm3_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd 
[state + _args_digest + 4*lane + 0*4*8], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x8_avx2 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..7b7b21287 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm @@ -0,0 +1,273 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "memcpy.asm" +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +; +; SM3_JOB* sm3_mb_mgr_submit_avx512 (SM3_MB_JOB_MGR *state, SM3_JOB* job); +; + +%ifdef HAVE_AS_KNOWS_AVX512 + +;todo sm3_mb_x16_avx512 +extern sm3_mb_x16_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +%define arg1 rdi ; state +%define arg2 rsi ; job + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx ; state +%define arg2 rdx ; job + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 ; +%define len2 arg2 ; + offset +%define p2 arg2 ; need + offset + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 +%define num_lanes_inuse r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + +; todo make sure +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +mk_global sm3_mb_mgr_submit_avx512, function +sm3_mb_mgr_submit_avx512: + endbranch + + ; save these registers + sub rsp, STACK_SPACE + ; rsp contain stack ptr , mov to stack bottom + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp ; unuse 1 2 + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 + ;mov rbx,rbp,r12,r13,r14,r15 to stack +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov 
unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + ; mov args to rbx and then mov rbx to rbp + ; unused_lanes - rbx , lane - rbp both have already backup + and lane, 0xF + ; unless lane is 0x789abcdef, and return 0 + + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + ; eq jump + cmp num_lanes_inuse, 16 + jne return_null + +start_loop: + ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8 + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x16_avx512 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 
1*16], xmm1 + +; restore stack +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 + + + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_mgr_submit_avx512 +no_sm3_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c new file mode 100644 index 000000000..b904ba0ca --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c @@ -0,0 +1,160 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + sm3_ossl(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c new file mode 100644 index 000000000..3671a3b79 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c @@ -0,0 +1,206 @@ +/********************************************************************** + Copyright(c) 2011-2019 
Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define ISAL_UNIT_TEST +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +// Compare against reference function +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + int ret; + + printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, 
ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // Run sm3_mb test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sm3_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c new file mode 100644 index 000000000..64e583ffc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c @@ -0,0 +1,298 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define ISAL_UNIT_TEST +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SM3_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SM3_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sm3_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]); + } + + // Run sb_sm3 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sm3_ctx_mgr_flush(mgr); + while (ctx) { + 
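+		// sm3_ctx_mgr_flush() may hand back a context that still has data
+		// left to hash; such contexts are resubmitted below as UPDATE/LAST
+		// until every job completes, while finished ones just continue the flush.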
if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sm3_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sm3_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + } + + sm3_ctx_mgr_init(mgr); + + // Run sm3_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SM3_BLOCK_SIZE + + SM3_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SM3_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sm3_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sm3_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SM3_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sm3_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c new file mode 100644 index 000000000..c409530c7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include "sm3_mb.h" + +typedef struct { + const char *msg; + uint32_t resultDigest[SM3_DIGEST_NWORDS]; +} TestData; + +static TestData test_data[] = { + { + .msg = "abc", + .resultDigest = {0xf4f0c766, 0xd9edee62, 0x6bd4f2d1, 0xe2e410dc, + 0x87c46741, 0xa2f7f25c, 0x2ba07d29, 0xe0a84b8f} + }, + { + .msg = "abcdabcdabcdabcdabcdabcdabcdabcd" "abcdabcdabcdabcdabcdabcdabcdabcd", + .resultDigest = {0xf99fbede, 0xa1b87522, 0x89486038, 0x4d5a8ec1, + 0xe570db6f, 0x65577e38, 0xa3cb3d29, 0x32570c9c} + + }, + { + .msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", + .resultDigest = {0xc56c9b63, 0x379e4de6, 0x92b190a3, 0xeaa14fdf, + 0x74ab2007, 0xb992f67f, 0x664e8cf3, 0x058c7bad} + }, + + {.msg = "0123456789:;<=>?@ABCDEFGHIJKLMNO", + .resultDigest = {0x076833d0, 0xd089ec39, 0xad857685, 0x8089797a, + 0x9df9e8fd, 0x4126eb9a, 0xf38c22e8, 0x054bb846}}, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<", + .resultDigest = {0x6cb9d38e, 0x846ac99e, 0x6d05634b, 0x3fe1bb26, + 0x90368c4b, 0xee8c4299, 0x08c0e96a, 0x2233cdc7} + }, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR", + .resultDigest = {0x83758189, 0x050f14d1, 0x91d8a730, 0x4a2825e4, + 0x11723273, 0x2114ee3f, 0x18cac172, 0xa9c5b07a} + }, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?", + .resultDigest = {0xb80f8aba, 0x55e96119, 0x851ac77b, 0xae31b3a5, + 0x1333e764, 0xc86ac40d, 0x34878db1, 0x7da873f6}, + }, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU", + .resultDigest = {0xbd5736a7, 0x55977d13, 0xa950c78a, 0x71eeb7cb, + 0xe9ef0ba5, 0x95a9302e, 0x155e5c33, 0xad96ce3c} + }, + { + .msg = "", + .resultDigest = {0x831db21a, 0x7fa1cf55, 0x4819618e, 0x8f1ae831, + 0xc7c8be22, 0x74fbfe28, 0xeb35d07e, 0x2baa8250} + + }, + +}; + +#define MSGS sizeof(test_data)/sizeof(TestData) +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +int main(void) +{ + + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sm3_ctx_mgr_init(mgr); + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], test_data[i].msg, + strlen((char *)test_data[i].msg), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = test_data[t].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, 
ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = test_data[t].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + test_data[j].msg, strlen((char *)test_data[j].msg), + HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = test_data[k].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = test_data[k].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sm3 test: Pass\n"); + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c new file mode 100644 index 000000000..ed4d9a092 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c @@ -0,0 +1,128 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sm3_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sm3_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sm3_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sm3_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sm3" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sm3 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c 
b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c new file mode 100644 index 000000000..025fd90ed --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c @@ -0,0 +1,133 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sm3_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS SM3_MAX_LANES + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
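+// With TEST_LEN = GT_L3_CACHE / TEST_BUFS, the buffers together exceed a
+// typical last-level cache, so each iteration reads cold data.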
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + uint32_t nlanes; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sm3_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sm3_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb shortage tests + for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) { + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < nlanes; i++) + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, + HASH_ENTIRE); + + while (sm3_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sm3" TEST_TYPE_STR " with %d lanes: ", nlanes); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < nlanes; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + } + + printf("Multi-buffer sm3 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm new file mode 100644 index 000000000..3b300fa80 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm @@ -0,0 +1,1035 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD _DIGEST_SAVE, 8*64, 64 +FIELD _rsp, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) ; rdi + 8*16 +%define DIGEST state ; rdi +%define SIZE num_blks ; rsi + +%define IDX var1 +%define TBL var2 + +%define APPEND(a,b) a %+ b + + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 + +; +; 4 ZMM for tmp data +; +%define TMP0 zmm8 +%define TMP1 zmm9 +%define TMP2 zmm10 +%define TMP3 zmm11 + +; +; Word W[] will be expand to array size 64 +; Word WB[] will be expand to array size 68 +; WB[j] : +; tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15); +; WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6]; +; W[j]: +; W[j] = WB[j] xor WB[j+4] +; +; so we used zmm12~31 20 numbers ZMM to keep WB +; it is because once we calc W[j] value, we need +; WB[j - 16] to WB[j + 4] , it is 20 WB number. 
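+;
+; In the SM3 spec, P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23); the WB[]
+; recurrence above is the standard message expansion, with WB[] playing
+; the role of the spec's W[] and W[] here being the spec's W'[].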
+; +; And also we keep the lane into ZMM12~ZMM27 +; once we calc WB value, lane will not work +; +%define WB0 zmm12 +%define WB1 zmm13 +%define WB2 zmm14 +%define WB3 zmm15 +%define WB4 zmm16 +%define WB5 zmm17 +%define WB6 zmm18 +%define WB7 zmm19 + +%define WB8 zmm20 +%define WB9 zmm21 +%define WB10 zmm22 +%define WB11 zmm23 +%define WB12 zmm24 +%define WB13 zmm25 +%define WB14 zmm26 +%define WB15 zmm27 + +%define WB16 zmm28 +%define WB17 zmm29 +%define WB18 zmm30 +%define WB19 zmm31 + + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +; +; same as sha256 +; +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} + 
vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + + ;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 
0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + + +%macro ROTATE_ARGS 0 + %xdefine TMP_ D + %xdefine D C + %xdefine C B + %xdefine B A + %xdefine A TMP3 + %xdefine TMP3 TMP_ + + %xdefine TMP2_ H + %xdefine H G + %xdefine G F + %xdefine F E + %xdefine E TMP0 + %xdefine TMP0 TMP2_ +%endmacro + +; +; P() Save in TMP0 +; used TMP1 +%macro P 1 +%define %%A %1 + vprold TMP0,%%A,9 + vprold TMP1,%%A,17 + + vpternlogd TMP0,TMP1,%%A,0x96 + +%endmacro + +; +; P1() Save in TMP0 +; used TMP1 +%macro P1 1 +%define %%A %1 + + vprold TMP0,%%A,15 + vprold TMP1,%%A,23 + + vpternlogd TMP0,TMP1,%%A,0x96 +%endmacro + +; +; FF_16() Save in TMP0 +; +%macro FF_16 3 +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I < 16 return (X ^ Y ^ Z) + vmovups TMP0,%%X + vpternlogd TMP0,%%Y,%%Z,0x96 +%endmacro + + +; +; FF_64() Save in TMP0 +; used TMP1 +%macro FF_64 3 + +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I > 16 return (x & y) | (x & z) | (y & z) + ; Same as (x & y) | (z & (x | y)) + vporq TMP0,%%X,%%Y + vpandq TMP0,%%Z + vpandq TMP1,%%X,%%Y + vporq TMP0,TMP1 +%endmacro + + +; +; GG() Save in TMP0 +; used TMP1 +%macro GG_16 3 +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I < 16 return (x ^ y ^ z) + vmovups TMP0,%%X + vpternlogd TMP0,%%Y,%%Z,0x96 +%endmacro + +%macro GG_64 3 + +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + + ; I > 16 return (x & y) | ((~x) & z) + vpandq TMP0,%%X,%%Y + vpandnd TMP1,%%X,%%Z + vporq TMP0,TMP1 +%endmacro + +;; void sm3_mb_x16_avx512(SM3_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sm3_mb_x16_avx512) +sm3_mb_x16_avx512: + endbranch + + mov rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _rsp], rax + + lea TBL, [TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*64] ; mov unsigned + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + vmovups F, [DIGEST + 5*64] + vmovups G, [DIGEST + 6*64] + vmovups H, [DIGEST + 7*64] + + xor IDX, IDX + +%assign cur_loop 0 +lloop: + ;; start message expand + ;; Transpose input data + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 
7*8] + + ;; stored B(i) to W(1)...W(15) + ;; in zmm16....zmm31 + + vmovups WB0,[inp0+IDX] + vmovups WB1,[inp1+IDX] + vmovups WB2,[inp2+IDX] + vmovups WB3,[inp3+IDX] + vmovups WB4,[inp4+IDX] + vmovups WB5,[inp5+IDX] + vmovups WB6,[inp6+IDX] + vmovups WB7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups WB8, [inp0+IDX] + vmovups WB9, [inp1+IDX] + vmovups WB10,[inp2+IDX] + vmovups WB11,[inp3+IDX] + vmovups WB12,[inp4+IDX] + vmovups WB13,[inp5+IDX] + vmovups WB14,[inp6+IDX] + vmovups WB15,[inp7+IDX] + + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 64 + + ; flat shuffle + TRANSPOSE16 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, TMP0, TMP1 + + ; little endian to big endian + vmovdqa32 TMP0, [SHUF_MASK] + vpshufb WB0,TMP0 + vpshufb WB1,TMP0 + vpshufb WB2,TMP0 + vpshufb WB3,TMP0 + vpshufb WB4,TMP0 + vpshufb WB5,TMP0 + vpshufb WB6,TMP0 + vpshufb WB7,TMP0 + vpshufb WB8,TMP0 + vpshufb WB9,TMP0 + vpshufb WB10,TMP0 + vpshufb WB11,TMP0 + vpshufb WB12,TMP0 + vpshufb WB13,TMP0 + vpshufb WB14,TMP0 + vpshufb WB15,TMP0 + +%assign I 0 +%rep 12 + %assign J I+4 + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A <<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_16 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + FF_16 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I),APPEND(WB,J) + vpaddd TMP3,TMP0 + + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + P TMP2 + vmovups E,TMP0 + + ;vprold B,9 + ;vprold F,19 + ;P TMP2 + ;ROTATE_ARGS + + %assign I (I+1) +%endrep + + +;tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15); +;WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6]; + +; round 12-16 here +%rep 4 + %assign J I+4 + + %assign J_3 J-3 + %assign J_16 J-16 + %assign J_9 J-9 + %assign J_13 J-13 + %assign J_6 J-6 + + ; clac WB(I+4) + vprold APPEND(WB,J),APPEND(WB,J_3),15 + vpxord APPEND(WB,J),APPEND(WB,J_16) + vpxord APPEND(WB,J),APPEND(WB,J_9) + + P1 APPEND(WB,J) + + vprold APPEND(WB,J),APPEND(WB,J_13),7 + vpxord APPEND(WB,J),TMP0 + vpxord APPEND(WB,J),APPEND(WB,J_6) + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A <<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_16 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + 
FF_16 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I),APPEND(WB,J) + vpaddd TMP3,TMP0 + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + P TMP2 + vmovups E,TMP0 + + %assign I (I+1) +%endrep + +%rep 48 + %assign J (((I+4) % 20) + 20) + + %assign J_3 ((J-3) % 20) + %assign J_16 ((J-16) % 20) + %assign J_9 ((J-9) % 20) + %assign J_13 ((J-13) % 20) + %assign J_6 ((J-6) % 20) + + %assign I_20 (I % 20) + %assign J (((I+4) % 20)) + + vprold APPEND(WB,J),APPEND(WB,J_3),15 + vpxord APPEND(WB,J),APPEND(WB,J_16) + vpxord APPEND(WB,J),APPEND(WB,J_9) + + P1 APPEND(WB,J) + + vprold APPEND(WB,J),APPEND(WB,J_13),7 + vpxord APPEND(WB,J),TMP0 + vpxord APPEND(WB,J),APPEND(WB,J_6) + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A <<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_64 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I_20) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + FF_64 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I_20),APPEND(WB,J) + vpaddd TMP3,TMP0 + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + P TMP2 + vmovups E,TMP0 + + %assign I (I+1) +%endrep + ; Xor old digest + vpxord A, A, [rsp + _DIGEST_SAVE + 64*0] + vpxord B, B, [rsp + _DIGEST_SAVE + 64*1] + vpxord C, C, [rsp + _DIGEST_SAVE + 64*2] + vpxord D, D, [rsp + _DIGEST_SAVE + 64*3] + vpxord E, E, [rsp + _DIGEST_SAVE + 64*4] + vpxord F, F, [rsp + _DIGEST_SAVE + 64*5] + vpxord G, G, [rsp + _DIGEST_SAVE + 64*6] + vpxord H, H, [rsp + _DIGEST_SAVE + 64*7] + + %assign cur_loop cur_loop+1 + sub SIZE, 1 + je last_loop + + jmp lloop + + +last_loop: + +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + ; Write out digest + vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + vmovups [DIGEST + 5*64], F + vmovups [DIGEST + 6*64], G + vmovups [DIGEST + 7*64], H + + + mov rsp, [rsp + _rsp] + ret + + +section .data +align 64 +TABLE: + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 
0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 
0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 
0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + + + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 
0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_x16_avx512 +no_sm3_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm new file mode 100644 index 000000000..0c2c9cdee --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm @@ -0,0 +1,711 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
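A note on the SHUF_MASK pattern defined a few lines above (before the %else branch): used with vpshufb it simply reverses the four bytes of every 32-bit lane word, turning the little-endian loads into the big-endian message words SM3 expects; this is the "little endian to big endian" step in the main loop. Per word it is equivalent to the small helper below, where the name load_be32 is illustrative only.

    #include <stdint.h>

    /* what vpshufb with SHUF_MASK does to each 32-bit message word */
    static inline uint32_t load_be32(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }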
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute oct SM3 using SSE-256 / AVX2 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax;ymm0-15 +;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers ymm0-15 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi + %define reg3 rcx + %define reg4 rdx +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx + %define reg3 rsi + %define reg4 rdi +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 +%define SIZE INP_SIZE ; rsi + +%define IDX rax +%define TBL reg3 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 reg4 + +%define APPEND(a,b) a %+ b + +%define WB0 ymm0 +%define WB1 ymm1 +%define WB2 ymm2 +%define WB3 ymm3 +%define WB4 ymm4 +%define WB5 ymm5 +%define WB6 ymm6 +%define WB7 ymm7 +%define WB8 ymm8 +%define WB9 ymm9 +%define WB10 ymm10 +%define WB11 ymm11 +%define WB12 ymm12 +%define WB13 ymm13 +%define WB14 ymm14 +%define WB15 ymm15 + +%define WBTMP0 ymm8 +%define WBTMP1 ymm9 + +%define WBTMP2 ymm0 +%define WBTMP3 ymm1 + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 +%define F ymm5 +%define G ymm6 +%define H ymm7 + +%define TMP0 ymm8 +%define TMP1 ymm9 +%define TMP2 ymm10 + +; W(j) = WB(j) + WB(j+4) +; Keep WB(j) - W(j+4) to reduce momory read +%define Wj0 ymm11 +%define Wj1 ymm12 +%define Wj2 ymm13 +%define Wj3 ymm14 +%define Wj4 ymm15 + + +%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register +%define PTR_SZ 8 +%define SM3_DIGEST_WORD_SIZE 4 +%define MAX_SM3_LANES 8 +%define NUM_SM3_DIGEST_WORDS 8 +%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE) + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +struc stack_frame + .data resb 16*SZ8 + .digest resb 8*SZ8 + .wbtmp resb 69*SZ8 + .rsp resb 8 +endstruc +%define FRAMESZ stack_frame_size +%define _DIGEST stack_frame.digest +%define _WBTMP stack_frame.wbtmp +%define _RSP_SAVE stack_frame.rsp + +%define YTMP0 rsp + _WBTMP + 0*SZ8 +%define YTMP1 rsp + _WBTMP + 1*SZ8 +%define YTMP2 rsp + _WBTMP + 2*SZ8 +%define YTMP3 rsp + _WBTMP + 3*SZ8 +%define YTMP4 rsp + _WBTMP + 4*SZ8 + +%define YTMPI rsp + _WBTMP + I*SZ8 +%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8 +%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8 +%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8 +%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8 + + +%define VMOVPS vmovups + +;;;;;;;; +; same as sha256 +;;;;;;;; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 
b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + +%macro ROTATE_W 0 + + %xdefine TMP_ Wj0 + %xdefine Wj0 Wj1 + %xdefine Wj1 Wj2 + %xdefine Wj2 Wj3 + %xdefine Wj3 Wj4 + + %xdefine Wj4 TMP_ + +%endmacro + +; ROTATE A,B,C,D +%macro ROTATE_ARGS_AD 0 + + %xdefine TMP_ D + %xdefine D C + %xdefine C B + %xdefine B A + %xdefine A TMP2 + %xdefine TMP2 TMP_ + +%endmacro + +%macro ROTATE_ARGS_EH 0 + + %xdefine TMP_ H + %xdefine H G + %xdefine G F + %xdefine F E + %xdefine E TMP0 + %xdefine TMP0 TMP_ + +%endmacro + +%macro ROLD 3 + +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, %%imm + vpsrld %%reg, %%reg, (32-(%%imm)) + vpor %%reg, %%reg, %%tmp + +%endmacro + +%macro ROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, %%imm + vpsrld %%reg, %%src, (32-(%%imm)) + vpor %%reg, %%reg, %%tmp +%endmacro + +;; void sm3_x8_avx2(SM3_ARGS *args, uint64_t bytes); +;; arg 1 : STATE : pointer to input data +;; arg 2 : INP_SIZE : size of input in blocks +mk_global sm3_mb_x8_avx2,function,internal +align 16 +sm3_mb_x8_avx2: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the YMM registers + + ; save rsp, allocate 32-byte aligned for local variables + mov IDX, rsp + sub rsp, FRAMESZ + and rsp, ~31 + mov [rsp + _RSP_SAVE], IDX + + lea TBL,[TABLE] + + ;; load the address of each of the 8 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ] + mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ] + mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ] + mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ] + mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ] + mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ] + mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ] + mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ] + + xor IDX, IDX + +%assign cur_loop 0 +lloop: + + ; + ; Pre calculate the WB 0..68 an W 0..64 + ; It will better than calculate WB/W in round method + ; + ; ps : SHA256(AVX2) calculate WB/W in round method + ; + ; Pre calculation memory io time: + ; read : 68 + 3 * 52(read WB) + ; write : 52(write WB17..68) + ; Round method calculation memory io time: + ; read : 48 * 6(read 6 number of WB each round) + ; write : 52 + 64(same as upper) + ; + VMOVPS WB0,[inp0+IDX] + VMOVPS WB1,[inp1+IDX] + VMOVPS WB2,[inp2+IDX] + VMOVPS WB3,[inp3+IDX] + VMOVPS 
WB4,[inp4+IDX] + VMOVPS WB5,[inp5+IDX] + VMOVPS WB6,[inp6+IDX] + VMOVPS WB7,[inp7+IDX] + + TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1 + vmovdqa WBTMP0, [SHUF_MASK] + vpshufb WB0,WBTMP0 + vpshufb WB1,WBTMP0 + vpshufb WB2,WBTMP0 + vpshufb WB3,WBTMP0 + vpshufb WB4,WBTMP0 + vpshufb WB5,WBTMP0 + vpshufb WB6,WBTMP0 + vpshufb WB7,WBTMP0 + + vmovdqa [YTMP0], WB0 + vmovdqa [YTMP1], WB1 + + VMOVPS WB8,[inp0+IDX + 32] + VMOVPS WB9,[inp1+IDX + 32] + VMOVPS WB10,[inp2+IDX + 32] + VMOVPS WB11,[inp3+IDX + 32] + VMOVPS WB12,[inp4+IDX + 32] + VMOVPS WB13,[inp5+IDX + 32] + VMOVPS WB14,[inp6+IDX + 32] + VMOVPS WB15,[inp7+IDX + 32] + + TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3 + vmovdqa WBTMP2, [SHUF_MASK] + vpshufb WB8,WBTMP2 + vpshufb WB9,WBTMP2 + vpshufb WB10,WBTMP2 + vpshufb WB11,WBTMP2 + vpshufb WB12,WBTMP2 + vpshufb WB13,WBTMP2 + vpshufb WB14,WBTMP2 + vpshufb WB15,WBTMP2 + +; WB0 WB1 already saved +%assign I 2 +%rep 14 + vmovdqa [YTMPI], APPEND(WB,I) +%assign I (I+1) +%endrep + + vmovdqa WB0 , [YTMP0] + vmovdqa WB1 , [YTMP1] + +; Calculate WB 16...67 +%rep 52 + %assign J (I % 16) + %assign J_1 ((I-1) % 16) ;tmp to use + %assign J_2 ((I-2) % 16) ;tmp to use + %assign J_3 ((I-3) % 16) + %assign J_4 ((I-4) % 16) ;tmp to use + %assign J_9 ((I-9) % 16) + %assign J_13 ((I-13) % 16) + %assign J_6 ((I-6) % 16) + + ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_9) + + ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J) + ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_1) + + ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_6) + + vmovdqa [YTMPI], APPEND(WB,J) + + vmovdqa APPEND(WB,J_1), [YTMPI_1] + vmovdqa APPEND(WB,J_2), [YTMPI_2] + vmovdqa APPEND(WB,J_4), [YTMPI_4] + + %assign I (I+1) +%endrep + + add IDX, 4*4*4 + + ; Every round need load A-H + ; Because we pre calculate the WB + vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE] + vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE] + vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE] + vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE] + vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE] + vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE] + vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE] + vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE] + + vmovdqa Wj0, [YTMP0] + vmovdqa Wj1, [YTMP1] + vmovdqa Wj2, [YTMP2] + vmovdqa Wj3, [YTMP3] + vmovdqa Wj4, [YTMP4] + + +%assign I 0 +%rep 16 + + ; SS1 - TMP1 + ROLD_nd TMP0,12,TMP1,A + vmovdqa TMP1, [TBL + (I*32)] + vpaddd TMP1,E + vpaddd TMP1,TMP0 + ROLD TMP1,7,TMP2 + + ; SS2 - TMP2 + vpxor TMP2,TMP1,TMP0 + + ; TT1 + vpxor TMP0,A,B + vpxor TMP0,C + vpaddd TMP2,TMP0 + vpaddd TMP2,D + vpxor TMP0,Wj0,Wj4 + vpaddd TMP2,TMP0 + + ROLD B,9,TMP0 + + ; Rotate a,b,c,d first + ; after P0(TT2) , Wj0 will be relase + ROTATE_ARGS_AD + + ; P0(TT2) + vpxor TMP0,E,F + vpxor TMP0,G + vpaddd TMP0,H + vpaddd TMP0,TMP1 + vpaddd TMP0,Wj0 + + ROLD_nd TMP1,9,TMP2,TMP0 + ROLD_nd Wj0,17,TMP2,TMP0 + + vpxor TMP0,TMP1 + vpxor TMP0,Wj0 + + ROLD F,19,TMP2 + + ROTATE_ARGS_EH + + ROTATE_W + + vmovdqa Wj4, [YTMPI5] + %assign I (I+1) +%endrep + +%rep 48 + ; SS1 - TMP1 + ROLD_nd TMP0,12,TMP1,A + vmovdqa TMP1, [TBL + (I*32)] + vpaddd TMP1,E + vpaddd TMP1,TMP0 + ROLD TMP1,7,TMP2 + + ; SS2 - TMP2 + vpxor TMP2,TMP1,TMP0 + + ; SS2 + D first + ; D will be release + ; FF16/GG16 diff with FF64/GG64 + ; So the register which keep D should be release 
before calculate TT1 + vpaddd TMP2,D + + ; TT1 + vpor TMP0,A,B + vpand TMP0,C + vpand D,A,B + vpor TMP0,D + + vpaddd TMP2,TMP0 + vpxor TMP0,Wj0,Wj4 + vpaddd TMP2,TMP0 + + ROLD B,9,TMP0 + + ROTATE_ARGS_AD + + ; P0(TT2) + vpaddd TMP1,H + vpaddd TMP1,Wj0 + + vpand TMP0,E,F + vpandn Wj0,E,G + vpor TMP0,Wj0 + + vpaddd TMP0,TMP1 + + ROLD_nd TMP1,9,TMP2,TMP0 + ROLD_nd Wj0,17,TMP2,TMP0 + + vpxor TMP0,TMP1 + vpxor TMP0,Wj0 + + ROLD F,19,TMP2 + + ROTATE_ARGS_EH + + ROTATE_W + vmovdqa Wj4, [YTMPI5] + %assign I (I+1) +%endrep + + vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE] + vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE] + vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE] + vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE] + vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE] + vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE] + vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE] + vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE] + + ; Write back to memory (state object) the transposed digest + vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A + vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B + vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C + vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D + vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E + vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F + vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G + vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H + + sub SIZE, 1 + je last_loop + jmp lloop + +last_loop: + + + ; update input pointers + add inp0, IDX + mov [STATE + _args_data_ptr + 0*8], inp0 + add inp1, IDX + mov [STATE + _args_data_ptr + 1*8], inp1 + add inp2, IDX + mov [STATE + _args_data_ptr + 2*8], inp2 + add inp3, IDX + mov [STATE + _args_data_ptr + 3*8], inp3 + add inp4, IDX + mov [STATE + _args_data_ptr + 4*8], inp4 + add inp5, IDX + mov [STATE + _args_data_ptr + 5*8], inp5 + add inp6, IDX + mov [STATE + _args_data_ptr + 6*8], inp6 + add inp7, IDX + mov [STATE + _args_data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, [rsp + _RSP_SAVE] + ret + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +align 64 +global TABLE +TABLE: + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 
0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 
0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + +SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm new file mode 100644 index 000000000..482876539 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm @@ -0,0 +1,81 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +extern sm3_ctx_mgr_init_base +extern sm3_ctx_mgr_submit_base +extern sm3_ctx_mgr_flush_base + +extern sm3_ctx_mgr_init_avx2 +extern sm3_ctx_mgr_submit_avx2 +extern sm3_ctx_mgr_flush_avx2 + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sm3_ctx_mgr_init_avx512 + extern sm3_ctx_mgr_submit_avx512 + extern sm3_ctx_mgr_flush_avx512 +%endif + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. 
+ +; Initialise symbols +mbin_interface sm3_ctx_mgr_init +mbin_interface sm3_ctx_mgr_submit +mbin_interface sm3_ctx_mgr_flush + +;; have not imlement see/avx yet +%ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \ + sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_avx2, \ + sm3_ctx_mgr_init_avx512 + mbin_dispatch_init6 sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \ + sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_avx2, \ + sm3_ctx_mgr_submit_avx512 + mbin_dispatch_init6 sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \ + sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_avx2, \ + sm3_ctx_mgr_flush_avx512 +%else + mbin_dispatch_init sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \ + sm3_ctx_mgr_init_base,sm3_ctx_mgr_init_avx2 + mbin_dispatch_init sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \ + sm3_ctx_mgr_submit_base,sm3_ctx_mgr_submit_avx2 + mbin_dispatch_init sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \ + sm3_ctx_mgr_flush_base,sm3_ctx_mgr_flush_avx2 +%endif + +;;; func core, ver, snum +slversion sm3_ctx_mgr_init, 00, 00, 2300 +slversion sm3_ctx_mgr_submit, 00, 00, 2301 +slversion sm3_ctx_mgr_flush, 00, 00, 2302 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c new file mode 100644 index 000000000..be56350b3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c @@ -0,0 +1,207 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
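The mbin_interface / mbin_dispatch_init lines above set up lazy, first-call dispatch: each public sm3_ctx_mgr_* symbol initially routes to a resolver that probes the CPU once, remembers the best available implementation (base, AVX2, or AVX-512 when built with HAVE_AS_KNOWS_AVX512), and forwards the call. Roughly, in C, the pattern looks like the sketch below; the feature probes (cpu_has_avx2, cpu_has_avx512) and the _dispatched pointer are illustrative stand-ins, not the actual macro internals.

    #include "sm3_mb.h"     /* SM3_HASH_CTX_MGR */

    /* real entry points, provided elsewhere in the library */
    void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR *mgr);
    void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR *mgr);
    void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR *mgr);

    /* hypothetical CPU feature probes, stand-ins for the real detection code */
    int cpu_has_avx2(void);
    int cpu_has_avx512(void);

    typedef void (*sm3_init_fn)(SM3_HASH_CTX_MGR *mgr);

    static void resolve_sm3_init(SM3_HASH_CTX_MGR *mgr);
    static sm3_init_fn sm3_ctx_mgr_init_dispatched = resolve_sm3_init;

    static void resolve_sm3_init(SM3_HASH_CTX_MGR *mgr)
    {
        sm3_init_fn best = sm3_ctx_mgr_init_base;    /* always available */

        if (cpu_has_avx2())
            best = sm3_ctx_mgr_init_avx2;
    #ifdef HAVE_AS_KNOWS_AVX512
        if (cpu_has_avx512())
            best = sm3_ctx_mgr_init_avx512;
    #endif
        sm3_ctx_mgr_init_dispatched = best;          /* resolve only once */
        best(mgr);                                   /* forward the first call */
    }

    void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR *mgr)
    {
        sm3_ctx_mgr_init_dispatched(mgr);
    }

The same resolve-once pattern is repeated for sm3_ctx_mgr_submit and sm3_ctx_mgr_flush.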
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" + +typedef uint32_t digest_sm3[SM3_DIGEST_NWORDS]; + +#define MSGS 2 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "abc"; +static uint8_t msg2[] = "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd"; + +/* small endian */ +static digest_sm3 exp_result_digest1 = { 0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2, + 0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0 +}; + +/* small endian */ +static digest_sm3 exp_result_digest2 = { 0xdebe9ff9, 0x2275b8a1, 0x38604889, 0xc18e5a4d, + 0x6fdb70e5, 0x387e5765, 0x293dcba3, 0x9c0c5732 +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2 }; + +static uint32_t *exp_result_digest[MSGS] = { + exp_result_digest1, exp_result_digest2 +}; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = exp_result_digest[t]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = exp_result_digest[t]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = exp_result_digest[k]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. 
Error code: %d", ctx->error); + return -1; + } + } + } + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = exp_result_digest[k]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sm3 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c new file mode 100644 index 000000000..4c0c54436 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c @@ -0,0 +1,45 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stddef.h>
+#include <openssl/evp.h>
+
+void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
+{
+	EVP_MD_CTX *md_ctx;
+	const EVP_MD *md;
+	unsigned int md_len;
+
+	md = EVP_sm3();
+	md_ctx = EVP_MD_CTX_new();
+	EVP_DigestInit_ex(md_ctx, md, NULL);
+	EVP_DigestUpdate(md_ctx, buf, length);
+	EVP_DigestFinal_ex(md_ctx, digest, &md_len);
+	EVP_MD_CTX_free(md_ctx);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
new file mode 100644
index 000000000..964baee11
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
@@ -0,0 +1,19 @@
+CFLAGS += -I ../../include
+libs += ../../bin/isa-l_crypto.a
+tests = $(patsubst %test.c, %test, $(wildcard *_test.c))
+
+tests: $(tests)
+$(tests): $(libs)
+%test: %test.c
+	$(CC) $< $(libs) $(CFLAGS) $(LDLIBS) -o $@
+$(libs):
+	$(MAKE) -C ../../ -f Makefile.unx
+test: $(addsuffix .run,$(tests))
+	@echo ALL PASS
+$(addsuffix .run,$(tests)): %.run: %
+	$(SIM) ./$<
+	@echo Completed run: $<
+clean:
+	$(RM) *.o $(tests)
+
+$(tests): LDLIBS += -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
new file mode 100644
index 000000000..daaf04e79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
@@ -0,0 +1,58 @@
+########################################################################
+#  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+######################################################################## + +tests = md5_mb_over_4GB_test.exe sha1_mb_over_4GB_test.exe \ +sha256_mb_over_4GB_test.exe sha512_mb_over_4GB_test.exe + +INCLUDES = -I../../include +LINKFLAGS = /nologo +INCLUDES = $(INCLUDES) -Ic:\OpenSSL-Win64\include +CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) /c +LINKFLAGS = $(LINKFLAGS) /libpath:c:\OpenSSL-Win64\lib +LIBS = ../../isa-l_crypto.lib +DLL = isa-l_crypto.dll + +tests: lib $(tests) +$(tests): $(@B).obj + link /out:$@ $(LINKFLAGS) libeay32.lib $(LIBS) $*.obj +%.obj: %.c + $(CC) $(CFLAGS) -Fo$@ $? + +lib: + cd ../../ && nmake -f Makefile.nmake + cd ../../ && copy $(DLL) "tests\extended" + +test: $(tests) + !$? + echo ALL PASS + +clean: + -if exist *.obj del *.obj + -if exist *.exe del *.exe + -if exist *.dll del *.dll diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c new file mode 100644 index 000000000..5eb7be75f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c @@ -0,0 +1,155 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
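The extended *_over_4GB_test.c files that follow (md5, sha1, sha256, sha512, sm3) all drive the same streaming state machine: each buffer is resubmitted in UPDATE_SIZE chunks, with the first chunk flagged HASH_FIRST, the last HASH_LAST and everything in between HASH_UPDATE, so the total length can exceed 4 GB without ever allocating 4 GB. The helper below is a compressed sketch of that loop for a single MD5 context; it is an illustration only, assumes the md5_mb.h API used in the test that follows, and leaves out the multi-context scheduling the real tests do to keep all lanes busy.

#include <stdint.h>
#include "md5_mb.h"

#define CHUNK (13 * MD5_BLOCK_SIZE)	/* same idea as UPDATE_SIZE below */

/* Feed len bytes of buf through one context in CHUNK-sized submits.
 * mgr must already be 16-byte aligned and passed through md5_ctx_mgr_init(). */
static void hash_streaming(MD5_HASH_CTX_MGR *mgr, MD5_HASH_CTX *c,
			   unsigned char *buf, uint64_t len)
{
	uint64_t done = 0;

	hash_ctx_init(c);
	while (done < len) {
		uint64_t n = (len - done > CHUNK) ? CHUNK : (len - done);
		int flag = (done == 0 && n == len) ? HASH_ENTIRE :
			   (done == 0)             ? HASH_FIRST  :
			   (done + n == len)       ? HASH_LAST   : HASH_UPDATE;
		MD5_HASH_CTX *r = md5_ctx_mgr_submit(mgr, c, buf + done, n, flag);

		/* NULL means no job has completed yet; flush until this context
		 * comes back before submitting its next chunk. */
		while (r == NULL)
			r = md5_ctx_mgr_flush(mgr);
		done += n;
	}
	/* c->job.result_digest[] now holds the digest of the whole stream. */
}

The real tests interleave many such contexts and only fall back to flush once no fresh context can be submitted, which is what keeps every SIMD lane full while the total processed length climbs past 4 GB.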
+**********************************************************************/ + +#include +#include +#include "md5_mb.h" +#include "endian_helper.h" +#include +#define TEST_LEN (1024*1024ull) //1M +#define TEST_BUFS MD5_MIN_LANES +#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES +#define UPDATE_SIZE (13*MD5_BLOCK_SIZE) +#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref_upd[4 * MD5_DIGEST_NWORDS]; + +struct user_data { + int idx; + uint64_t processed; +}; + +int main(void) +{ + MD5_CTX o_ctx; //openSSL + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, k, fail = 0; + unsigned char *bufs[TEST_BUFS]; + struct user_data udata[TEST_BUFS]; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + md5_ctx_mgr_init(mgr); + + printf("md5_large_test\n"); + + // Init ctx contents + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)&udata[i]; + } + + //Openssl MD5 update test + MD5_Init(&o_ctx); + for (k = 0; k < ROTATION_TIMES; k++) { + MD5_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN); + } + MD5_Final(digest_ref_upd, &o_ctx); + + // Initialize pool + for (i = 0; i < TEST_BUFS; i++) { + struct user_data *u = (struct user_data *)ctxpool[i].user_data; + u->idx = i; + u->processed = 0; + } + + printf("Starting updates\n"); + int highest_pool_idx = 0; + ctx = &ctxpool[highest_pool_idx++]; + while (ctx) { + int len = UPDATE_SIZE; + int update_type = HASH_UPDATE; + struct user_data *u = (struct user_data *)ctx->user_data; + int idx = u->idx; + + if (u->processed == 0) + update_type = HASH_FIRST; + + else if (hash_ctx_complete(ctx)) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = md5_ctx_mgr_flush(mgr); + continue; + } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) { + len = (LEN_TOTAL - u->processed); + update_type = HASH_LAST; + } + u->processed += len; + ctx = md5_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type); + + if (NULL == ctx) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = md5_ctx_mgr_flush(mgr); + } + } + + printf("multibuffer md5 digest: \n"); + for (i = 0; i < TEST_BUFS; i++) { + printf("Total processing size of buf[%d] is %ld \n", i, + ctxpool[i].total_length); + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]); + } + } + printf("\n"); + + printf("openssl md5 update digest: \n"); + for (i = 0; i < MD5_DIGEST_NWORDS; i++) + printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i])); + printf("\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref_upd)[j])) { + fail++; + } + } + } + + if (fail) + printf("Test failed md5 hash large file check %d\n", fail); + else + printf(" md5_hash_large_test: Pass\n"); + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c new file mode 100644 index 000000000..af94a8098 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c @@ -0,0 +1,156 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha1_mb.h" +#include "endian_helper.h" +#include +#define TEST_LEN (1024*1024ull) //1M +#define TEST_BUFS SHA1_MIN_LANES +#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES +#define UPDATE_SIZE (13*SHA1_BLOCK_SIZE) +#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref_upd[4 * SHA1_DIGEST_NWORDS]; + +struct user_data { + int idx; + uint64_t processed; +}; + +int main(void) +{ + SHA_CTX o_ctx; //openSSL + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, k, fail = 0; + unsigned char *bufs[TEST_BUFS]; + struct user_data udata[TEST_BUFS]; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + printf("sha1_large_test\n"); + + // Init ctx contents + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)&udata[i]; + } + + //Openssl SHA1 update test + SHA1_Init(&o_ctx); + for (k = 0; k < ROTATION_TIMES; k++) { + SHA1_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN); + } + SHA1_Final(digest_ref_upd, &o_ctx); + + // Initialize pool + for (i = 0; i < TEST_BUFS; i++) { + struct user_data *u = (struct user_data *)ctxpool[i].user_data; + u->idx = i; + u->processed = 0; + } + + printf("Starting updates\n"); + int highest_pool_idx = 0; + ctx = &ctxpool[highest_pool_idx++]; + while (ctx) { + int len = UPDATE_SIZE; + int update_type = HASH_UPDATE; + struct 
user_data *u = (struct user_data *)ctx->user_data; + int idx = u->idx; + + if (u->processed == 0) + update_type = HASH_FIRST; + + else if (hash_ctx_complete(ctx)) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha1_ctx_mgr_flush(mgr); + continue; + } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) { + len = (LEN_TOTAL - u->processed); + update_type = HASH_LAST; + } + u->processed += len; + ctx = sha1_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type); + + if (NULL == ctx) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha1_ctx_mgr_flush(mgr); + } + } + + printf("multibuffer SHA1 digest: \n"); + for (i = 0; i < TEST_BUFS; i++) { + printf("Total processing size of buf[%d] is %ld \n", i, + ctxpool[i].total_length); + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]); + } + } + printf("\n"); + + printf("openssl SHA1 update digest: \n"); + for (i = 0; i < SHA1_DIGEST_NWORDS; i++) + printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i])); + printf("\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ref_upd)[j])) { + fail++; + } + } + } + + if (fail) + printf("Test failed SHA1 hash large file check %d\n", fail); + else + printf(" SHA1_hash_large_test: Pass\n"); + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c new file mode 100644 index 000000000..35bbdcbae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c @@ -0,0 +1,156 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "sha256_mb.h" +#include "endian_helper.h" +#include +#define TEST_LEN (1024*1024ull) //1M +#define TEST_BUFS SHA256_MIN_LANES +#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES +#define UPDATE_SIZE (13*SHA256_BLOCK_SIZE) +#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref_upd[4 * SHA256_DIGEST_NWORDS]; + +struct user_data { + int idx; + uint64_t processed; +}; + +int main(void) +{ + SHA256_CTX o_ctx; //openSSL + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, k, fail = 0; + unsigned char *bufs[TEST_BUFS]; + struct user_data udata[TEST_BUFS]; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + printf("sha256_large_test\n"); + + // Init ctx contents + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)&udata[i]; + } + + //Openssl SHA256 update test + SHA256_Init(&o_ctx); + for (k = 0; k < ROTATION_TIMES; k++) { + SHA256_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN); + } + SHA256_Final(digest_ref_upd, &o_ctx); + + // Initialize pool + for (i = 0; i < TEST_BUFS; i++) { + struct user_data *u = (struct user_data *)ctxpool[i].user_data; + u->idx = i; + u->processed = 0; + } + + printf("Starting updates\n"); + int highest_pool_idx = 0; + ctx = &ctxpool[highest_pool_idx++]; + while (ctx) { + int len = UPDATE_SIZE; + int update_type = HASH_UPDATE; + struct user_data *u = (struct user_data *)ctx->user_data; + int idx = u->idx; + + if (u->processed == 0) + update_type = HASH_FIRST; + + else if (hash_ctx_complete(ctx)) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha256_ctx_mgr_flush(mgr); + continue; + } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) { + len = (LEN_TOTAL - u->processed); + update_type = HASH_LAST; + } + u->processed += len; + ctx = sha256_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type); + + if (NULL == ctx) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha256_ctx_mgr_flush(mgr); + } + } + + printf("multibuffer SHA256 digest: \n"); + for (i = 0; i < TEST_BUFS; i++) { + printf("Total processing size of buf[%d] is %ld \n", i, + ctxpool[i].total_length); + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]); + } + } + printf("\n"); + + printf("openssl SHA256 update digest: \n"); + for (i = 0; i < SHA256_DIGEST_NWORDS; i++) + printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i])); + printf("\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ref_upd)[j])) { + fail++; + } + } + } + + if (fail) + printf("Test failed SHA256 hash large file check %d\n", fail); + else + printf(" SHA256_hash_large_test: Pass\n"); + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c new file mode 100644 index 
000000000..9c2aeaead --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c @@ -0,0 +1,156 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha512_mb.h" +#include "endian_helper.h" +#include +#define TEST_LEN (1024*1024ull) //1M +#define TEST_BUFS SHA512_MIN_LANES +#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES +#define UPDATE_SIZE (13*SHA512_BLOCK_SIZE) +#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref_upd[8 * SHA512_DIGEST_NWORDS]; + +struct user_data { + int idx; + uint64_t processed; +}; + +int main(void) +{ + SHA512_CTX o_ctx; //openSSL + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, k, fail = 0; + unsigned char *bufs[TEST_BUFS]; + struct user_data udata[TEST_BUFS]; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha512_ctx_mgr_init(mgr); + + printf("sha512_large_test\n"); + + // Init ctx contents + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)&udata[i]; + } + + //Openssl SHA512 update test + SHA512_Init(&o_ctx); + for (k = 0; k < ROTATION_TIMES; k++) { + SHA512_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN); + } + SHA512_Final(digest_ref_upd, &o_ctx); + + // Initialize pool + for (i = 0; i < TEST_BUFS; i++) { + struct user_data *u = (struct user_data *)ctxpool[i].user_data; + u->idx = i; + u->processed = 0; + } + + printf("Starting updates\n"); + int highest_pool_idx = 0; + ctx = &ctxpool[highest_pool_idx++]; + while 
(ctx) { + int len = UPDATE_SIZE; + int update_type = HASH_UPDATE; + struct user_data *u = (struct user_data *)ctx->user_data; + int idx = u->idx; + + if (u->processed == 0) + update_type = HASH_FIRST; + + else if (hash_ctx_complete(ctx)) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha512_ctx_mgr_flush(mgr); + continue; + } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) { + len = (LEN_TOTAL - u->processed); + update_type = HASH_LAST; + } + u->processed += len; + ctx = sha512_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type); + + if (NULL == ctx) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sha512_ctx_mgr_flush(mgr); + } + } + + printf("multibuffer sha512 digest: \n"); + for (i = 0; i < TEST_BUFS; i++) { + printf("Total processing size of buf[%d] is %ld \n", i, + ctxpool[i].total_length); + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + printf("digest%d : %016lX\n", j, ctxpool[i].job.result_digest[j]); + } + } + printf("\n"); + + printf("openssl sha512 update digest: \n"); + for (i = 0; i < SHA512_DIGEST_NWORDS; i++) + printf("%016lX - ", to_be64(((uint64_t *) digest_ref_upd)[i])); + printf("\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be64(((uint64_t *) digest_ref_upd)[j])) { + fail++; + } + } + } + + if (fail) + printf("Test failed sha512 hash large file check %d\n", fail); + else + printf(" sha512_hash_large_test: Pass\n"); + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c new file mode 100644 index 000000000..ea98e29b5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c @@ -0,0 +1,162 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include "sm3_mb.h" +#include "endian_helper.h" +#include + +#define TEST_LEN (1024*1024ull) //1M +#define TEST_BUFS SM3_MAX_LANES +#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES +#define UPDATE_SIZE (13*SM3_BLOCK_SIZE) +#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref_upd[4 * SM3_DIGEST_NWORDS]; + +struct user_data { + int idx; + uint64_t processed; +}; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, k, fail = 0; + unsigned char *bufs[TEST_BUFS]; + struct user_data udata[TEST_BUFS]; + EVP_MD_CTX *md_ctx; + const EVP_MD *md; + unsigned int md_len; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + printf("sm3_large_test\n"); + + // Init ctx contents + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)&udata[i]; + } + + //Openssl SM3 update test + md = EVP_sm3(); + md_ctx = EVP_MD_CTX_new(); + EVP_DigestInit_ex(md_ctx, md, NULL); + for (k = 0; k < ROTATION_TIMES; k++) { + EVP_DigestUpdate(md_ctx, bufs[k % TEST_BUFS], TEST_LEN); + } + EVP_DigestFinal_ex(md_ctx, digest_ref_upd, &md_len); + EVP_MD_CTX_free(md_ctx); + + // Initialize pool + for (i = 0; i < TEST_BUFS; i++) { + struct user_data *u = (struct user_data *)ctxpool[i].user_data; + u->idx = i; + u->processed = 0; + } + + printf("Starting updates\n"); + int highest_pool_idx = 0; + ctx = &ctxpool[highest_pool_idx++]; + while (ctx) { + int len = UPDATE_SIZE; + int update_type = HASH_UPDATE; + struct user_data *u = (struct user_data *)ctx->user_data; + int idx = u->idx; + + if (u->processed == 0) + update_type = HASH_FIRST; + + else if (hash_ctx_complete(ctx)) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sm3_ctx_mgr_flush(mgr); + continue; + } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) { + len = (LEN_TOTAL - u->processed); + update_type = HASH_LAST; + } + u->processed += len; + ctx = sm3_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type); + + if (NULL == ctx) { + if (highest_pool_idx < TEST_BUFS) + ctx = &ctxpool[highest_pool_idx++]; + else + ctx = sm3_ctx_mgr_flush(mgr); + } + } + + printf("multibuffer SM3 digest: \n"); + for (i = 0; i < TEST_BUFS; i++) { + printf("Total processing size of buf[%d] is %ld \n", i, + ctxpool[i].total_length); + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]); + } + } + printf("\n"); + + printf("openssl SM3 update digest: \n"); + for (i = 0; i < SM3_DIGEST_NWORDS; i++) + printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i])); + printf("\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref_upd)[j])) { + fail++; + } + } + } + + if (fail) + printf("Test failed SM3_hash_large check %d\n", fail); + else + printf(" SM3_hash_large_test: Pass\n"); + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh 
b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh new file mode 100755 index 000000000..8c67a931c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +set -e +rc=0 +verbose=0 +indent_args='-linux -l95 -cp1 -lps -il6 -ncs' +function iver { printf "%03d%03d%03d%03d" $(echo "$@" | sed 's/GNU indent//' | tr '.' ' '); } + +while [ -n "$*" ]; do + case "$1" in + -v ) + verbose=1 + shift + ;; + -h ) + echo check_format.sh [-h -v] + exit 0 + ;; + esac +done + +echo "Checking format of files in the git index at $PWD" +if ! git rev-parse --is-inside-work-tree >& /dev/null; then + echo "Not in a git repo: Fail" + exit 1 +fi + +if hash indent && [ $(iver $(indent --version)) -ge $(iver 2.2.12) ]; then + echo "Checking C files for coding style..." + for f in `git ls-files '*.c'`; do + [ "$verbose" -gt 0 ] && echo "checking style on $f" + if ! indent $indent_args -st $f | diff -q $f - >& /dev/null; then + echo " File found with formatting issues: $f" + [ "$verbose" -gt 0 ] 2> /dev/null && indent $indent_args -st $f | diff -u $f - + rc=1 + fi + done + [ "$rc" -gt 0 ] && echo " Run ./tools/iindent on files" +else + echo "You do not have a recent indent installed so your code style is not being checked!" +fi + +if hash grep; then + echo "Checking for dos and whitespace violations..." + for f in $(git ls-files); do + [ "$verbose" -gt 0 ] && echo "checking whitespace on $f" + if grep -q '[[:space:]]$' $f ; then + echo " File found with trailing whitespace: $f" + rc=1 + fi + if grep -q $'\r' $f ; then + echo " File found with dos formatting: $f" + rc=1 + fi + done +fi + +echo "Checking source files for permissions..." +while read -r perm _res0 _res1 f; do + [ -z "$f" ] && continue + [ "$verbose" -gt 0 ] && echo "checking permissions on $f" + if [ "$perm" -ne 100644 ]; then + echo " File found with permissions issue ($perm): $f" + rc=1 + fi +done <<< $(git ls-files -s -- ':(exclude)*.sh' ':(exclude)*iindent') + +echo "Checking script files for permissions..." +while read -r perm _res0 _res1 f; do + [ -z "$f" ] && continue + [ "$verbose" -gt 0 ] && echo "checking permissions on $f" + if [ "$perm" -ne 100755 ]; then + echo " Script found with permissions issue ($perm): $f" + rc=1 + fi +done <<< $(git ls-files -s '*.sh') + + +echo "Checking for signoff in commit message..." +if ! git log -n 1 --format=%B | grep -q "^Signed-off-by:" ; then + echo " Commit not signed off. 
Please read src/CONTRIBUTING.md" + rc=1 +fi + +[ "$rc" -gt 0 ] && echo Format Fail || echo Format Pass + +exit $rc diff --git a/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk new file mode 100644 index 000000000..f2c8b46ed --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk @@ -0,0 +1,123 @@ +# Regenerate nmake file from makefiles or check its consistency + +test_nmake_file: tst.nmake + @diff -u Makefile.nmake tst.nmake || (echo Potential nmake consistency issue; $(RM) tst.nmake; false;) + @echo No nmake consistency issues + @$(RM) tst.nmake + +FORCE: +Makefile.nmake tst.nmake: FORCE + @echo Regenerating $@ + @echo '########################################################################' > $@ + @cat LICENSE | sed -e 's/^/#/ ' >> $@ + @echo '########################################################################' >> $@ + @echo '' >> $@ + @echo '# This file can be auto-regenerated with $$make -f Makefile.unx Makefile.nmake' >> $@ + @echo '' >> $@ + @echo -n 'objs =' >> $@ + @$(foreach o, $(subst /,\\,$(objs:.o=.obj)), printf " %s\n\t%s" \\ $(o) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'INCLUDES = $(INCLUDE)' >> $@ + @echo '# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level' >> $@ + @echo 'FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI' >> $@ + @echo 'CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy' >> $@ + @echo 'CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd' >> $@ + @echo 'LINKFLAGS = -nologo -incremental:no -debug' >> $@ + @echo 'CFLAGS = $$(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@ + @echo 'AFLAGS = -f win64 $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@ + @echo 'CC = cl' >> $@ + @echo '# or CC = icl -Qstd=c99' >> $@ + @echo 'AS = nasm' >> $@ + @echo '' >> $@ + @echo 'lib: bin static dll' >> $@ + @echo 'static: bin isa-l_crypto_static.lib' >> $@ + @echo 'dll: bin isa-l_crypto.dll' >> $@ + @echo '' >> $@ + @echo 'bin: ; -mkdir $$@' >> $@ + @echo '' >> $@ + @echo 'isa-l_crypto_static.lib: $$(objs)' >> $@ + @echo ' lib -out:$$@ @<<' >> $@ + @echo '$$?' >> $@ + @echo '<<' >> $@ + @echo '' >> $@ + @echo 'isa-l_crypto.dll: $$(objs)' >> $@ + @echo ' link -out:$$@ -dll -def:isa-l_crypto.def $$(LINKFLAGS) @<<' >> $@ + @echo '$$?' >> $@ + @echo '<<' >> $@ + @echo '' >> $@ + @$(foreach b, $(units), \ + printf "{%s}.c.obj:\n\t\$$(CC) \$$(CFLAGS) /c -Fo\$$@ \$$?\n{%s}.asm.obj:\n\t\$$(AS) \$$(AFLAGS) -o \$$@ \$$?\n\n" $(b) $(b) >> $@; ) + @echo '' >> $@ +ifneq (,$(examples)) + @echo "# Examples" >> $@ + @echo -n 'ex =' >> $@ + @$(foreach ex, $(notdir $(examples)), printf " %s\n\t%s.exe" \\ $(ex) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'ex: lib $$(ex)' >> $@ + @echo '' >> $@ + @echo '$$(ex): $$(@B).obj' >> $@ +endif + @echo '' >> $@ + @echo '.obj.exe:' >> $@ + @echo ' link /out:$$@ $$(LINKFLAGS) isa-l_crypto.lib $$?' >> $@ + @echo '' >> $@ + @echo '# Check tests' >> $@ + @echo -n 'checks =' >> $@ + @$(foreach check, $(notdir $(check_tests)), printf " %s\n\t%s.exe" \\ $(check) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'checks: lib $$(checks)' >> $@ + @echo '$$(checks): $$(@B).obj' >> $@ + @echo 'check: $$(checks)' >> $@ + @echo ' !$$?' 
>> $@ + @echo '' >> $@ + @echo '# Unit tests' >> $@ + @echo -n 'tests =' >> $@ + @$(foreach test, $(notdir $(unit_tests)), printf " %s\n\t%s.exe" \\ $(test) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'tests: lib $$(tests)' >> $@ + @echo '$$(tests): $$(@B).obj' >> $@ + @echo '' >> $@ + @echo '# Performance tests' >> $@ + @echo -n 'perfs =' >> $@ + @$(foreach perf, $(notdir $(perf_tests)), printf " %s\n\t%s.exe" \\ $(perf) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'perfs: lib $$(perfs)' >> $@ + @echo '$$(perfs): $$(@B).obj' >> $@ + @echo '' >> $@ + @echo -n 'progs =' >> $@ + @$(foreach prog, $(notdir $(bin_PROGRAMS)), printf " %s\n\t%s.exe" \\ $(prog) >> $@; ) + @echo '' >> $@ + @echo '' >> $@ + @echo 'progs: lib $$(progs)' >> $@ + @$(foreach p, $(notdir $(bin_PROGRAMS)), \ + printf "%s.exe: %s\n\tlink /out:\$$@ \$$(LINKFLAGS) isa-l_crypto.lib \$$?\n" $(p) $(subst /,\\,$(programs_$(p)_SOURCES:.c=.obj)) >> $@; ) + @echo '' >> $@ + @echo 'clean:' >> $@ + @echo ' -if exist *.obj del *.obj' >> $@ + @echo ' -if exist bin\*.obj del bin\*.obj' >> $@ + @echo ' -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib' >> $@ + @echo ' -if exist *.exe del *.exe' >> $@ + @echo ' -if exist *.pdb del *.pdb' >> $@ + @echo ' -if exist isa-l_crypto.lib del isa-l_crypto.lib' >> $@ + @echo ' -if exist isa-l_crypto.dll del isa-l_crypto.dll' >> $@ + @echo ' -if exist isa-l_crypto.exp del isa-l_crypto.exp' >> $@ + @echo '' >> $@ + @echo 'libcrypto.lib:' >> $@ + @cat $(foreach unit,$(units), $(unit)/Makefile.am) | sed \ + -e '/: /!d' \ + -e 's/\([^ :]*\)[ ]*/\1.exe /g' \ + -e :c -e 's/:\(.*\).exe/:\1/;tc' \ + -e 's/\.o[ $$]/.obj /g' \ + -e 's/\.o\.exe[ ]:/.obj:/g' \ + -e '/CFLAGS_.*+=/d' \ + -e '/:.*\%.*:/d' \ + -e 's/ :/:/' \ + -e 's/LDLIBS *+=//' \ + -e 's/-lcrypto/libcrypto.lib/' \ + -e 's/ $$//' \ + >> $@ diff --git a/src/crypto/isa-l/isa-l_crypto/tools/iindent b/src/crypto/isa-l/isa-l_crypto/tools/iindent new file mode 100755 index 000000000..48d26360f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/iindent @@ -0,0 +1,2 @@ +#!/bin/sh +indent -linux -l95 -cp1 -lps -il6 -ncs "$@" diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh new file mode 100755 index 000000000..19e03856c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh @@ -0,0 +1,56 @@ +#/bin/sh + +# Filter out unnecessary options added by automake + +while [ -n "$*" ]; do + case "$1" in + -o ) + # Supported options with arg + options="$options $1 $2" + shift + object="$1" + shift + ;; + -f | -D ) + # Supported options with arg + options="$options $1 $2" + shift + shift + ;; + -I | -i ) + options="$options $1 $2/" + shift + shift + ;; + --prefix* ) + # Supported options without arg + options="$options $1" + shift + ;; + -I* | -i* ) + options="$options $1/" + shift + ;; + -D* ) # For defines we need to remove spaces + case "$1" in + *' '* ) ;; + *) options="$options $1" ;; + esac + shift + ;; + #-blah ) + # Unsupported options with args - none known + -* ) + # Unsupported options with no args + shift + ;; + * ) + args="$args $1" + shift + ;; + esac +done + +nasm $options $args +$CET_LD -r -z ibt -z shstk -o $object.tmp $object +mv $object.tmp $object diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh new file mode 100755 index 000000000..5ec9ba3f3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh @@ -0,0 +1,47 @@ +#/bin/sh + +# Filter out 
unnecessary options added by automake + +while [ -n "$*" ]; do + case "$1" in + -f | -o | -D ) + # Supported options with arg + options="$options $1 $2" + shift + shift + ;; + -I | -i ) + options="$options $1 $2/" + shift + shift + ;; + --prefix* ) + # Supported options without arg + options="$options $1" + shift + ;; + -I* | -i* ) + options="$options $1/" + shift + ;; + -D* ) # For defines we need to remove spaces + case "$1" in + *' '* ) ;; + *) options="$options $1" ;; + esac + shift + ;; + #-blah ) + # Unsupported options with args - none known + -* ) + # Unsupported options with no args + shift + ;; + * ) + args="$args $1" + shift + ;; + esac +done + +nasm $options $args diff --git a/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh new file mode 100755 index 000000000..bb82b9fa5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh @@ -0,0 +1,2 @@ +#!/bin/sh +sed -i -i.bak 's/[[:blank:]]*$//' "$@" diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh new file mode 100755 index 000000000..756e1e069 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e #exit on fail + +# Override defaults if exist +READLINK=readlink +command -V greadlink >/dev/null 2>&1 && READLINK=greadlink + + +out="$PWD" +src=$($READLINK -f $(dirname $0))/.. +cd "$src" + +# Echo environment info +if test -d .git; then + branch=$(git describe) + commitid=$(git rev-parse HEAD) + brief=$(git log -1 --format='%s') + branch_changes=$(git diff --shortstat) +fi +if command -V uname >/dev/null 2>&1; then + node=$(uname -n) + os_name=$(uname -s) + os_all=$(uname -a) +fi + +echo "Test report v1" +echo "branch: $branch" +echo "brief: $brief" +echo "commitid: $commitid" +echo "node: $node" +echo "os_name: $os_name" +echo "os_all: $os_all" +echo "test_args: $@" +echo "changes: $branch_changes" +command -V lscpu > /dev/null 2>&1 && lscpu + +# Start tests + +[ -z "$1" ] && ./tools/test_checks.sh + +while [ -n "$1" ]; do + case "$1" in + check ) + ./tools/test_checks.sh + shift ;; + ext ) + ./tools/test_extended.sh + shift ;; + format ) + shift ;; + all ) + ./tools/test_checks.sh + ./tools/test_extended.sh + shift ;; + * ) + echo $0 undefined option: $1 + shift ;; + esac +done + +./tools/check_format.sh + diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh new file mode 100755 index 000000000..9573554db --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +set -xe #exit on fail + +# Defaults +cpus=1 +S=$RANDOM +MAKE=make +READLINK=readlink + +# Override defaults if exist +command -V gmake >/dev/null 2>&1 && MAKE=gmake +command -V greadlink >/dev/null 2>&1 && READLINK=greadlink + +out="$PWD" +src=$($READLINK -f $(dirname $0))/.. 
+source $src/tools/test_tools.sh +cd "$src" +tmp_install_dir=$out/tmp_install + +# Run on mult cpus +if command -V lscpu >/dev/null 2>&1; then + cpus=`lscpu -p | tail -1 | cut -d, -f 2` + cpus=$(($cpus + 1)) +elif command -V sysctl; then + if sysctl -n hw.ncpu >/dev/null 2>&1; then + cpus=$(sysctl -n hw.ncpu) + cpus=$(($cpus + 1)) + fi +fi +echo "Using $cpus cpu threads" + +# Pick a random test seed +if [ -z "$S" ]; then + S=`tr -cd 0-9 /dev/null || S="123" +fi +echo "Running with TEST_SEED=$S" + +# Fix Darwin issues +if uname | grep -q 'Darwin' 2>&1; then + export SED=`which sed` + opt_config_target='--target=darwin' +fi + +# Tests +time ./autogen.sh +time ./configure --prefix=$tmp_install_dir $opt_config_target +time $MAKE -j $cpus +test_start "check_tests" +time $MAKE check -j $cpus D="-D TEST_SEED=$S" +test_end "check_tests" $? +test_start "installation_test" +time $MAKE install +test_end "installation_test" $? + +# Check for gnu executable stack set +if command -V readelf >/dev/null 2>&1; then + if readelf -W -l $tmp_install_dir/lib/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then + echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Fail + exit 1 + else + echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Pass + fi +else + echo Stack NX check not supported +fi + +$MAKE clean + + + +echo $0: Pass diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh new file mode 100755 index 000000000..b79cbb0c1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash + +# Extended tests: Run a few more options other than make check + +set -xe #exit on fail + +# Defaults +cpus=1 +S=$RANDOM +MAKE=make +READLINK=readlink +test_level=check +build_opt='' +msg='' + +# Override defaults if exist +command -V gmake >/dev/null 2>&1 && MAKE=gmake +command -V greadlink >/dev/null 2>&1 && READLINK=greadlink +[ -n "$CC" ] && build_opt+="CC=$CC " +[ -n "$AS" ] && build_opt+="AS=$AS " + +out="$PWD" +src=$($READLINK -f $(dirname $0))/.. +source $src/tools/test_tools.sh +cd "$src" + +# Run on mult cpus +if command -V lscpu >/dev/null 2>&1; then + cpus=`lscpu -p | tail -1 | cut -d, -f 2` + cpus=$(($cpus + 1)) +elif command -V sysctl; then + if sysctl -n hw.ncpu >/dev/null 2>&1; then + cpus=$(sysctl -n hw.ncpu) + cpus=$(($cpus + 1)) + fi +fi +echo "Using $cpus cpu threads" + +if [ -z "$S" ]; then + S=`tr -cd 0-9 /dev/null || S="123" +fi +msg+="Running with TEST_SEED=$S".$'\n' + +# Fix Darwin issues +if uname | grep -q 'Darwin' 2>&1; then + export SED=`which sed` +fi + +# Check for test libs to add +if command -V ldconfig >/dev/null 2>&1; then + if ldconfig -p | grep -q libcrypto.so; then + test_level=test + msg+=$'With extra tests\n' + fi + if ldconfig -p | grep -q libefence.so; then + build_opt+="LDFLAGS+='-lefence' " + msg+=$'With efence\n' + fi +fi + +# Std makefile build test +$MAKE -f Makefile.unx clean +test_start "extended_build_test" +time $MAKE -f Makefile.unx -j $cpus $build_opt +test_end "extended_build_test" $? 
+msg+=$'Std makefile build: Pass\n' + +# Check for gnu executable stack set +if command -V readelf >/dev/null 2>&1; then + test_start "stack_nx_check" + if readelf -W -l bin/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then + echo $0: Stack NX check bin/libisal_crypto.so: Fail + test_end "stack_nx_check" 1 + exit 1 + else + test_end "stack_nx_check" 0 + msg+=$'Stack NX check bin/lib/libisal_crypto.so: Pass\n' + fi +else + msg+=$'Stack NX check not supported: Skip\n' +fi + +# Std makefile build perf tests +test_start "extended_perf_test" +time $MAKE -f Makefile.unx -j $cpus perfs +test_end "extended_perf_test" $? +msg+=$'Std makefile build perf: Pass\n' + +# Std makefile run tests +test_start "extended_makefile_tests" +time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" $test_level +test_end "extended_makefile_tests" $? +msg+=$'Std makefile tests: Pass\n' + +# Std makefile build other +test_start "extended_other_tests" +time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" other +test_end "extended_other_tests" $? +msg+=$'Other tests build: Pass\n' + +$MAKE -f Makefile.unx clean + +# Std makefile run tests with NT_LDST +test_start "extended_makefile_tests with NT_LDST" +$MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST" +time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST" $test_level +test_end "extended_makefile_tests with NT_LDST" $? +msg+=$'Std makefile tests: Pass\n' + +$MAKE -f Makefile.unx clean + +# noarch makefile run tests +test_start "extended_makefile_tests" +time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \ + arch=noarch +time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \ + arch=noarch $test_level +test_end "extended_makefile_tests" $? +msg+=$'noarch makefile tests: Pass\n' + +set +x +echo +echo "Summary test $0:" +echo "Build opt: $build_opt" +echo "$msg" +echo "$0: Final: Pass" diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh new file mode 100755 index 000000000..448b1f92b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +function test_start() +{ + echo "entering test: $1" +} + +function test_end() +{ + echo "leaving test: $1 status: $2" +} diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh new file mode 100755 index 000000000..d7b3e973d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh @@ -0,0 +1,47 @@ +#/bin/sh + +# Filter out unnecessary options added by automake + +while [ -n "$*" ]; do + case "$1" in + -o ) + # Supported options with arg + options="$options $1 $2" + shift + object="$1" + shift + ;; + -f | -I | -i | -D ) + # Supported options with arg + options="$options $1 $2" + shift + shift + ;; + -I* | -i* | --prefix* ) + # Supported options without arg + options="$options $1" + shift + ;; + -D* ) # For defines we need to remove spaces + case "$1" in + *' '* ) ;; + *) options="$options $1" ;; + esac + shift + ;; + #-blah ) + # Unsupported options with args - none known + -* ) + # Unsupported options with no args + shift + ;; + * ) + args="$args $1" + shift + ;; + esac +done + +yasm $options $args +$CET_LD -r -z ibt -z shstk -o $object.tmp $object +mv $object.tmp $object diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh new file mode 100755 index 000000000..c33952a40 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh @@ -0,0 +1,38 @@ +#/bin/sh + +# Filter out unnecessary options added by automake + +while [ -n "$*" ]; do + case "$1" in + -f | -o | -I | -i | -D ) + # Supported options with arg + options="$options $1 $2" + shift + shift + ;; + -I* | -i* | --prefix* ) + # Supported options without arg + options="$options $1" + shift + ;; + -D* ) # For defines we need to remove spaces + case "$1" in + *' '* ) ;; + *) options="$options $1" ;; + esac + shift + ;; + #-blah ) + # Unsupported options with args - none known + -* ) + # Unsupported options with no args + shift + ;; + * ) + args="$args $1" + shift + ;; + esac +done + +yasm $options $args diff --git a/src/crypto/isa-l/isal_crypto_accel.cc b/src/crypto/isa-l/isal_crypto_accel.cc new file mode 100644 index 000000000..7dccf64fd --- /dev/null +++ b/src/crypto/isa-l/isal_crypto_accel.cc @@ -0,0 +1,43 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include "crypto/isa-l/isal_crypto_accel.h" + +#include "crypto/isa-l/isa-l_crypto/include/aes_cbc.h" + +bool ISALCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) +{ + if ((size % AES_256_IVSIZE) != 0) { + return false; + } + alignas(16) struct cbc_key_data keys_blk; + aes_cbc_precomp(const_cast(&key[0]), AES_256_KEYSIZE, &keys_blk); + aes_cbc_enc_256(const_cast(in), + const_cast(&iv[0]), keys_blk.enc_keys, out, size); + return true; +} +bool ISALCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) +{ + if ((size % AES_256_IVSIZE) != 0) { + return false; + } + alignas(16) struct cbc_key_data keys_blk; + aes_cbc_precomp(const_cast(&key[0]), AES_256_KEYSIZE, &keys_blk); + aes_cbc_dec_256(const_cast(in), const_cast(&iv[0]), keys_blk.dec_keys, out, size); + return true; +} diff --git a/src/crypto/isa-l/isal_crypto_accel.h b/src/crypto/isa-l/isal_crypto_accel.h new file mode 100644 index 000000000..84331bbdd --- /dev/null +++ b/src/crypto/isa-l/isal_crypto_accel.h @@ -0,0 +1,31 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
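The ISALCryptoAccel implementation above is a thin wrapper over ISA-L's expanded-key CBC routines: it rejects lengths that are not a multiple of the 16-byte block, expands the 256-bit key once with aes_cbc_precomp(), then calls the one-shot encrypt and decrypt functions with the precomputed schedules. A standalone plain-C sketch of that call order follows; it is illustrative only, and the prototypes, the 32-byte key-size argument and the cbc_key_data field names are assumed to match what the wrapper above pulls in from isa-l_crypto/include/aes_cbc.h.

#include <stdint.h>
#include "aes_cbc.h"		/* from isa-l_crypto/include */

/* AES-256-CBC round trip over one buffer; len must be a multiple of 16. */
static int cbc_roundtrip(uint8_t key[32], uint8_t iv[16],
			 uint8_t *in, uint8_t *enc, uint8_t *dec, uint64_t len)
{
	_Alignas(16) struct cbc_key_data keys;	/* the wrapper 16-byte aligns this too */

	if (len % 16 != 0)
		return -1;

	aes_cbc_precomp(key, 32, &keys);	/* 32 == AES-256 key bytes */
	aes_cbc_enc_256(in, iv, keys.enc_keys, enc, len);
	aes_cbc_dec_256(enc, iv, keys.dec_keys, dec, len);
	return 0;
}

Key expansion is the costly step, so the wrapper performs it once per request and then hands the prepared enc_keys or dec_keys schedule straight to the block routine.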
+ * + */ + +#ifndef ISAL_CRYPTO_ACCEL_H +#define ISAL_CRYPTO_ACCEL_H +#include "crypto/crypto_accel.h" + +class ISALCryptoAccel : public CryptoAccel { + public: + ISALCryptoAccel() {} + virtual ~ISALCryptoAccel() {} + + bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; + bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; +}; +#endif diff --git a/src/crypto/isa-l/isal_crypto_plugin.cc b/src/crypto/isa-l/isal_crypto_plugin.cc new file mode 100644 index 000000000..85f0e5f0f --- /dev/null +++ b/src/crypto/isa-l/isal_crypto_plugin.cc @@ -0,0 +1,34 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + + +// ----------------------------------------------------------------------------- +#include "crypto/isa-l/isal_crypto_plugin.h" + +#include "ceph_ver.h" +// ----------------------------------------------------------------------------- + +const char *__ceph_plugin_version() +{ + return CEPH_GIT_NICE_VER; +} + +int __ceph_plugin_init(CephContext *cct, + const std::string& type, + const std::string& name) +{ + auto instance = cct->get_plugin_registry(); + + return instance->add(type, name, new ISALCryptoPlugin(cct)); +} diff --git a/src/crypto/isa-l/isal_crypto_plugin.h b/src/crypto/isa-l/isal_crypto_plugin.h new file mode 100644 index 000000000..68e782e69 --- /dev/null +++ b/src/crypto/isa-l/isal_crypto_plugin.h @@ -0,0 +1,47 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Mirantis, Inc. + * + * Author: Adam Kupczyk + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef ISAL_CRYPTO_PLUGIN_H +#define ISAL_CRYPTO_PLUGIN_H +// ----------------------------------------------------------------------------- +#include "crypto/crypto_plugin.h" +#include "crypto/isa-l/isal_crypto_accel.h" +#include "arch/intel.h" +#include "arch/probe.h" +// ----------------------------------------------------------------------------- + + +class ISALCryptoPlugin : public CryptoPlugin { + +public: + + explicit ISALCryptoPlugin(CephContext* cct) : CryptoPlugin(cct) + {} + ~ISALCryptoPlugin() + {} + virtual int factory(CryptoAccelRef *cs, + std::ostream *ss) + { + if (cryptoaccel == nullptr) + { + ceph_arch_probe(); + if (ceph_arch_intel_aesni && ceph_arch_intel_sse41) { + cryptoaccel = CryptoAccelRef(new ISALCryptoAccel); + } + } + *cs = cryptoaccel; + return 0; + } +}; +#endif diff --git a/src/crypto/openssl/CMakeLists.txt b/src/crypto/openssl/CMakeLists.txt new file mode 100644 index 000000000..6ede1567f --- /dev/null +++ b/src/crypto/openssl/CMakeLists.txt @@ -0,0 +1,14 @@ +## openssl + +set(openssl_crypto_plugin_srcs + openssl_crypto_accel.cc + openssl_crypto_plugin.cc) + +add_library(ceph_crypto_openssl SHARED ${openssl_crypto_plugin_srcs}) +target_link_libraries(ceph_crypto_openssl + PRIVATE OpenSSL::Crypto + $<$:ceph-common>) +target_include_directories(ceph_crypto_openssl PRIVATE ${OPENSSL_INCLUDE_DIR}) +add_dependencies(crypto_plugins ceph_crypto_openssl) +set_target_properties(ceph_crypto_openssl PROPERTIES INSTALL_RPATH "") +install(TARGETS ceph_crypto_openssl DESTINATION ${crypto_plugin_dir}) diff --git a/src/crypto/openssl/openssl_crypto_accel.cc b/src/crypto/openssl/openssl_crypto_accel.cc new file mode 100644 index 000000000..e6ea0fa72 --- /dev/null +++ b/src/crypto/openssl/openssl_crypto_accel.cc @@ -0,0 +1,104 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
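The OpenSSL-backed accelerator whose body follows funnels both directions through one EVP helper: create a cipher context, initialise it for AES-256-CBC with the encrypt or decrypt flag, disable padding (the caller already guarantees whole 16-byte blocks), then run CipherUpdate plus CipherFinal and verify that every byte was consumed. Here is a minimal plain-C sketch of that same sequence; it is an illustration of the pattern, not the plugin code itself.

#include <stddef.h>
#include <openssl/evp.h>

/* AES-256-CBC with padding disabled: size must be a multiple of 16 bytes.
 * enc is 1 to encrypt, 0 to decrypt. Returns 1 on success, 0 on failure. */
static int cbc_transform(unsigned char *out, const unsigned char *in, size_t size,
			 const unsigned char *iv, const unsigned char *key, int enc)
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	int len = 0, fin = 0, ok = 0;

	if (ctx == NULL)
		return 0;
	if (EVP_CipherInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv, enc) == 1 &&
	    EVP_CIPHER_CTX_set_padding(ctx, 0) == 1 &&
	    EVP_CipherUpdate(ctx, out, &len, in, (int)size) == 1 &&
	    EVP_CipherFinal_ex(ctx, out + len, &fin) == 1)
		ok = ((size_t)(len + fin) == size);
	EVP_CIPHER_CTX_free(ctx);
	return ok;
}

Keeping a single helper and flipping only the final flag is what lets the cbc_encrypt() and cbc_decrypt() entry points below stay nearly identical.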
+ * + */ + +#include "crypto/openssl/openssl_crypto_accel.h" +#include +#include +#include "common/debug.h" + +// ----------------------------------------------------------------------------- +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_crypto +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +static std::ostream& +_prefix(std::ostream* _dout) +{ + return *_dout << "OpensslCryptoAccel: "; +} +// ----------------------------------------------------------------------------- + +#define EVP_SUCCESS 1 +#define AES_ENCRYPT 1 +#define AES_DECRYPT 0 + +bool evp_transform(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char* iv, + const unsigned char* key, + ENGINE* engine, + const EVP_CIPHER* const type, + const int encrypt) +{ + using pctx_t = std::unique_ptr; + pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free }; + + if (!pctx) { + derr << "failed to create evp cipher context" << dendl; + return false; + } + + if (EVP_CipherInit_ex(pctx.get(), type, engine, key, iv, encrypt) != EVP_SUCCESS) { + derr << "EVP_CipherInit_ex failed" << dendl; + return false; + } + + if (EVP_CIPHER_CTX_set_padding(pctx.get(), 0) != EVP_SUCCESS) { + derr << "failed to disable PKCS padding" << dendl; + return false; + } + + int len_update = 0; + if (EVP_CipherUpdate(pctx.get(), out, &len_update, in, size) != EVP_SUCCESS) { + derr << "EVP_CipherUpdate failed" << dendl; + return false; + } + + int len_final = 0; + if (EVP_CipherFinal_ex(pctx.get(), out + len_update, &len_final) != EVP_SUCCESS) { + derr << "EVP_CipherFinal_ex failed" << dendl; + return false; + } + + ceph_assert(len_final == 0); + return (len_update + len_final) == static_cast(size); +} + +bool OpenSSLCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) +{ + if ((size % AES_256_IVSIZE) != 0) { + return false; + } + + return evp_transform(out, in, size, const_cast(&iv[0]), + const_cast(&key[0]), + nullptr, // Hardware acceleration engine can be used in the future + EVP_aes_256_cbc(), AES_ENCRYPT); +} + +bool OpenSSLCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) +{ + if ((size % AES_256_IVSIZE) != 0) { + return false; + } + + return evp_transform(out, in, size, const_cast(&iv[0]), + const_cast(&key[0]), + nullptr, // Hardware acceleration engine can be used in the future + EVP_aes_256_cbc(), AES_DECRYPT); +} diff --git a/src/crypto/openssl/openssl_crypto_accel.h b/src/crypto/openssl/openssl_crypto_accel.h new file mode 100644 index 000000000..ad90cbece --- /dev/null +++ b/src/crypto/openssl/openssl_crypto_accel.h @@ -0,0 +1,32 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef OPENSSL_CRYPTO_ACCEL_H +#define OPENSSL_CRYPTO_ACCEL_H + +#include "crypto/crypto_accel.h" + +class OpenSSLCryptoAccel : public CryptoAccel { + public: + OpenSSLCryptoAccel() {} + virtual ~OpenSSLCryptoAccel() {} + + bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; + bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; +}; +#endif diff --git a/src/crypto/openssl/openssl_crypto_plugin.cc b/src/crypto/openssl/openssl_crypto_plugin.cc new file mode 100644 index 000000000..e6ecea2fd --- /dev/null +++ b/src/crypto/openssl/openssl_crypto_plugin.cc @@ -0,0 +1,32 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + + +#include "crypto/openssl/openssl_crypto_plugin.h" + +#include "ceph_ver.h" + +const char *__ceph_plugin_version() +{ + return CEPH_GIT_NICE_VER; +} + +int __ceph_plugin_init(CephContext *cct, + const std::string& type, + const std::string& name) +{ + auto instance = cct->get_plugin_registry(); + + return instance->add(type, name, new OpenSSLCryptoPlugin(cct)); +} diff --git a/src/crypto/openssl/openssl_crypto_plugin.h b/src/crypto/openssl/openssl_crypto_plugin.h new file mode 100644 index 000000000..408d9ebda --- /dev/null +++ b/src/crypto/openssl/openssl_crypto_plugin.h @@ -0,0 +1,36 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_PLUGIN_H
+#define OPENSSL_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/openssl/openssl_crypto_accel.h"
+
+
+class OpenSSLCryptoPlugin : public CryptoPlugin {
+
+  CryptoAccelRef cryptoaccel;
+public:
+  explicit OpenSSLCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+  {}
+  int factory(CryptoAccelRef *cs, std::ostream *ss) override {
+    if (cryptoaccel == nullptr)
+      cryptoaccel = CryptoAccelRef(new OpenSSLCryptoAccel);
+
+    *cs = cryptoaccel;
+    return 0;
+  }
+};
+#endif
diff --git a/src/crypto/qat/CMakeLists.txt b/src/crypto/qat/CMakeLists.txt
new file mode 100644
index 000000000..fb751967a
--- /dev/null
+++ b/src/crypto/qat/CMakeLists.txt
@@ -0,0 +1,20 @@
+##
+# QAT wrapper for Ceph
+##
+
+set(qat_crypto_plugin_srcs
+  qat_crypto_accel.cc
+  qat_crypto_plugin.cc
+  qcccrypto.cc)
+
+add_library(ceph_crypto_qat SHARED ${qat_crypto_plugin_srcs})
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+
+target_link_libraries(ceph_crypto_qat PRIVATE
+  QatDrv::qat_s
+  QatDrv::usdm_drv_s)
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+set_target_properties(ceph_crypto_qat PROPERTIES VERSION 1.0.0 SOVERSION 1)
+install(TARGETS ceph_crypto_qat DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/qat/qat_crypto_accel.cc b/src/crypto/qat/qat_crypto_accel.cc
new file mode 100644
index 000000000..23f86edfa
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.cc
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren
+ * Author: Ganesh Mahalingam
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/qat/qat_crypto_accel.h"
+
+bool QccCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+    const unsigned char (&iv)[AES_256_IVSIZE],
+    const unsigned char (&key)[AES_256_KEYSIZE])
+{
+  if ((size % AES_256_IVSIZE) != 0) {
+    return false;
+  }
+
+  return qcccrypto.perform_op(out, in, size,
+      const_cast<unsigned char*>(&iv[0]),
+      const_cast<unsigned char*>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);
+}
+
+bool QccCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+    const unsigned char (&iv)[AES_256_IVSIZE],
+    const unsigned char (&key)[AES_256_KEYSIZE])
+{
+  if ((size % AES_256_IVSIZE) != 0) {
+    return false;
+  }
+
+  return qcccrypto.perform_op(out, in, size,
+      const_cast<unsigned char*>(&iv[0]),
+      const_cast<unsigned char*>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT);
+}
diff --git a/src/crypto/qat/qat_crypto_accel.h b/src/crypto/qat/qat_crypto_accel.h
new file mode 100644
index 000000000..5badefc28
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.h
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren
+ * Author: Ganesh Mahalingam
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ * + */ + +#ifndef QAT_CRYPTO_ACCEL_H +#define QAT_CRYPTO_ACCEL_H + +#include "crypto/crypto_accel.h" +#include "crypto/qat/qcccrypto.h" + +class QccCryptoAccel : public CryptoAccel { + public: + QccCrypto qcccrypto; + QccCryptoAccel() { qcccrypto.init(); }; + ~QccCryptoAccel() { qcccrypto.destroy(); }; + + bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; + bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size, + const unsigned char (&iv)[AES_256_IVSIZE], + const unsigned char (&key)[AES_256_KEYSIZE]) override; +}; +#endif diff --git a/src/crypto/qat/qat_crypto_plugin.cc b/src/crypto/qat/qat_crypto_plugin.cc new file mode 100644 index 000000000..4bf3d61bb --- /dev/null +++ b/src/crypto/qat/qat_crypto_plugin.cc @@ -0,0 +1,35 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * Author: Ganesh Mahalingam + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + + +#include "crypto/qat/qat_crypto_plugin.h" + +#include "ceph_ver.h" + +std::mutex QccCryptoPlugin::qat_init; + +const char *__ceph_plugin_version() +{ + return CEPH_GIT_NICE_VER; +} + +int __ceph_plugin_init(CephContext *cct, + const std::string& type, + const std::string& name) +{ + PluginRegistry *instance = cct->get_plugin_registry(); + + return instance->add(type, name, new QccCryptoPlugin(cct)); +} diff --git a/src/crypto/qat/qat_crypto_plugin.h b/src/crypto/qat/qat_crypto_plugin.h new file mode 100644 index 000000000..a8d4df7cb --- /dev/null +++ b/src/crypto/qat/qat_crypto_plugin.h @@ -0,0 +1,42 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Intel Corporation + * + * Author: Qiaowei Ren + * Author: Ganesh Mahalingam + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ */
+
+#ifndef QAT_CRYPTO_PLUGIN_H
+#define QAT_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/qat/qat_crypto_accel.h"
+
+
+class QccCryptoPlugin : public CryptoPlugin {
+  static std::mutex qat_init;
+
+public:
+
+  explicit QccCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+  {}
+  ~QccCryptoPlugin()
+  {}
+  virtual int factory(CryptoAccelRef *cs, std::ostream *ss)
+  {
+    std::lock_guard<std::mutex> l(qat_init);
+    if (cryptoaccel == nullptr)
+      cryptoaccel = CryptoAccelRef(new QccCryptoAccel);
+
+    *cs = cryptoaccel;
+    return 0;
+  }
+};
+#endif
diff --git a/src/crypto/qat/qcccrypto.cc b/src/crypto/qat/qcccrypto.cc
new file mode 100644
index 000000000..a3f253726
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.cc
@@ -0,0 +1,471 @@
+#include "qcccrypto.h"
+#include <iostream>
+#include "string.h"
+#include <pthread.h>
+#include "common/debug.h"
+#include "include/scope_guard.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream& _prefix(std::ostream* _dout)
+{
+  return *_dout << "QccCrypto: ";
+}
+// -----------------------------------------------------------------------------
+
+/*
+ * Poller thread & functions
+*/
+static std::mutex qcc_alloc_mutex;
+static std::mutex qcc_eng_mutex;
+static std::atomic<bool> init_called = { false };
+
+void* QccCrypto::crypt_thread(void *args) {
+  struct qcc_thread_args *thread_args = (struct qcc_thread_args *)args;
+  thread_args->qccinstance->do_crypt(thread_args);
+  return thread_args;
+}
+
+void QccCrypto::QccFreeInstance(int entry) {
+  std::lock_guard<std::mutex> freeinst(qcc_alloc_mutex);
+  open_instances.push(entry);
+}
+
+int QccCrypto::QccGetFreeInstance() {
+  int ret = -1;
+  std::lock_guard<std::mutex> getinst(qcc_alloc_mutex);
+  if (!open_instances.empty()) {
+    ret = open_instances.front();
+    open_instances.pop();
+  }
+  return ret;
+}
+
+void QccCrypto::cleanup() {
+  icp_sal_userStop();
+  qaeMemDestroy();
+  is_init = false;
+  init_stat = stat;
+  init_called = false;
+  derr << "Failure during QAT init sequence. Quitting" << dendl;
+}
+
+/*
+ * We initialize the QAT instance and everything that is common for all ops
+*/
+bool QccCrypto::init()
+{
+
+  std::lock_guard<std::mutex> l(qcc_eng_mutex);
+
+  if(init_called) {
+    dout(10) << "Init sequence already called. Skipping duplicate call" << dendl;
+    return true;
+  }
+
+  // First call to init
+  dout(15) << "First init for QAT" << dendl;
+  init_called = true;
+
+  // Find if the usermode memory driver is available. We need this to
+  // create the contiguous memory needed by QAT.
+ stat = qaeMemInit(); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to load memory driver" << dendl; + this->cleanup(); + return false; + } + + stat = icp_sal_userStart("CEPH"); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to start qat device" << dendl; + this->cleanup(); + return false; + } + + qcc_os_mem_alloc((void **)&qcc_inst, sizeof(QCCINST)); + if(qcc_inst == NULL) { + derr << "Unable to alloc mem for instance struct" << dendl; + this->cleanup(); + return false; + } + + // Initialize contents of qcc_inst + qcc_inst->num_instances = 0; + qcc_inst->cy_inst_handles = NULL; + + stat = cpaCyGetNumInstances(&(qcc_inst->num_instances)); + if ((stat != CPA_STATUS_SUCCESS) || (qcc_inst->num_instances <= 0)) { + derr << "Unable to find available instances" << dendl; + this->cleanup(); + return false; + } + + qcc_os_mem_alloc((void **)&qcc_inst->cy_inst_handles, + ((int)qcc_inst->num_instances * sizeof(CpaInstanceHandle))); + if (qcc_inst->cy_inst_handles == NULL) { + derr << "Unable to allocate instances array memory" << dendl; + this->cleanup(); + return false; + } + + stat = cpaCyGetInstances(qcc_inst->num_instances, qcc_inst->cy_inst_handles); + if (stat != CPA_STATUS_SUCCESS) { + derr << "Unable to get instances" << dendl; + this->cleanup(); + return false; + } + + int iter = 0; + //Start Instances + for(iter = 0; iter < qcc_inst->num_instances; iter++) { + stat = cpaCyStartInstance(qcc_inst->cy_inst_handles[iter]); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to start instance" << dendl; + this->cleanup(); + return false; + } + } + + qcc_os_mem_alloc((void **)&qcc_inst->is_polled, + ((int)qcc_inst->num_instances * sizeof(CpaBoolean))); + CpaInstanceInfo2 info; + for(iter = 0; iter < qcc_inst->num_instances; iter++) { + qcc_inst->is_polled[iter] = cpaCyInstanceGetInfo2(qcc_inst->cy_inst_handles[iter], + &info) == CPA_STATUS_SUCCESS ? info.isPolled : CPA_FALSE; + } + + // Allocate memory structures for all instances + qcc_os_mem_alloc((void **)&qcc_sess, + ((int)qcc_inst->num_instances * sizeof(QCCSESS))); + if(qcc_sess == NULL) { + derr << "Unable to allocate memory for session struct" << dendl; + this->cleanup(); + return false; + } + + qcc_os_mem_alloc((void **)&qcc_op_mem, + ((int)qcc_inst->num_instances * sizeof(QCCOPMEM))); + if(qcc_sess == NULL) { + derr << "Unable to allocate memory for opmem struct" << dendl; + this->cleanup(); + return false; + } + + qcc_os_mem_alloc((void **)&cypollthreads, + ((int)qcc_inst->num_instances * sizeof(pthread_t))); + if(cypollthreads == NULL) { + derr << "Unable to allocate memory for pthreads" << dendl; + this->cleanup(); + return false; + } + + //At this point we are only doing an user-space version. + //To-Do: Maybe a kernel based one + for(iter = 0; iter < qcc_inst->num_instances; iter++) { + stat = cpaCySetAddressTranslation(qcc_inst->cy_inst_handles[iter], + qaeVirtToPhysNUMA); + if(stat == CPA_STATUS_SUCCESS) { + // Start HW Polling Thread + // To-Do: Enable epoll & interrupt based later? + // QccCyStartPoll(iter); + // Setup the session structures for crypto operation and populate + // whatever we can now. Rest will be filled in when crypto operation + // happens. 
+ qcc_sess[iter].sess_ctx_sz = 0; + qcc_sess[iter].sess_ctx = NULL; + qcc_sess[iter].sess_stp_data.sessionPriority = CPA_CY_PRIORITY_NORMAL; + qcc_sess[iter].sess_stp_data.symOperation = CPA_CY_SYM_OP_CIPHER; + open_instances.push(iter); + qcc_op_mem[iter].is_mem_alloc = false; + qcc_op_mem[iter].op_complete = false; + qcc_op_mem[iter].op_result = CPA_STATUS_SUCCESS; + qcc_op_mem[iter].sym_op_data = NULL; + qcc_op_mem[iter].buff_meta_size = qcc_op_mem[iter].buff_size = 0; + qcc_op_mem[iter].src_buff_meta = qcc_op_mem[iter].src_buff + = qcc_op_mem[iter].iv_buff = NULL; + qcc_op_mem[iter].src_buff_list = NULL; + qcc_op_mem[iter].src_buff_flat = NULL; + qcc_op_mem[iter].num_buffers = 1; + } else { + derr << "Unable to find address translations of instance " << iter << dendl; + this->cleanup(); + return false; + } + } + is_init = true; + dout(10) << "Init complete" << dendl; + return true; +} + +bool QccCrypto::destroy() { + if((!is_init) || (!init_called)) { + dout(15) << "QAT not initialized here. Nothing to do" << dendl; + return false; + } + + unsigned int retry = 0; + while(retry <= QCC_MAX_RETRIES) { + if(open_instances.size() == qcc_inst->num_instances) { + break; + } else { + retry++; + } + dout(5) << "QAT is still busy and cannot free resources yet" << dendl; + return false; + } + + dout(10) << "Destroying QAT crypto & related memory" << dendl; + int iter = 0; + + // Free up op related memory + for (iter =0; iter < qcc_inst->num_instances; iter++) { + qcc_contig_mem_free((void **)&(qcc_op_mem[iter].src_buff)); + qcc_contig_mem_free((void **)&(qcc_op_mem[iter].iv_buff)); + qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_list)); + qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_flat)); + qcc_contig_mem_free((void **)&(qcc_op_mem[iter].sym_op_data)); + } + + // Free up Session memory + for(iter = 0; iter < qcc_inst->num_instances; iter++) { + cpaCySymRemoveSession(qcc_inst->cy_inst_handles[iter], qcc_sess[iter].sess_ctx); + qcc_contig_mem_free((void **)&(qcc_sess[iter].sess_ctx)); + } + + // Stop QAT Instances + for(iter = 0; iter < qcc_inst->num_instances; iter++) { + cpaCyStopInstance(qcc_inst->cy_inst_handles[iter]); + } + + // Free up the base structures we use + qcc_os_mem_free((void **)&qcc_op_mem); + qcc_os_mem_free((void **)&qcc_sess); + qcc_os_mem_free((void **)&(qcc_inst->cy_inst_handles)); + qcc_os_mem_free((void **)&(qcc_inst->is_polled)); + qcc_os_mem_free((void **)&cypollthreads); + qcc_os_mem_free((void **)&qcc_inst); + + //Un-init memory driver and QAT HW + icp_sal_userStop(); + qaeMemDestroy(); + init_called = false; + is_init = false; + return true; +} + +void QccCrypto::do_crypt(qcc_thread_args *thread_args) { + auto entry = thread_args->entry; + qcc_op_mem[entry].op_result = cpaCySymPerformOp(qcc_inst->cy_inst_handles[entry], + NULL, + qcc_op_mem[entry].sym_op_data, + qcc_op_mem[entry].src_buff_list, + qcc_op_mem[entry].src_buff_list, + NULL); + qcc_op_mem[entry].op_complete = true; + free(thread_args); +} + +bool QccCrypto::perform_op(unsigned char* out, const unsigned char* in, + size_t size, uint8_t *iv, uint8_t *key, CpaCySymCipherDirection op_type) +{ + if (!init_called) { + dout(10) << "QAT not intialized yet. Initializing now..." 
<< dendl; + if(!QccCrypto::init()) { + derr << "QAT init failed" << dendl; + return false; + } + } + + if(!is_init) + { + dout(10) << "QAT not initialized in this instance or init failed with possible error " << (int)init_stat << dendl; + return is_init; + } + + int avail_inst = -1; + unsigned int retrycount = 0; + while(retrycount <= QCC_MAX_RETRIES) { + avail_inst = QccGetFreeInstance(); + if(avail_inst != -1) { + break; + } else { + retrycount++; + usleep(qcc_sleep_duration); + } + } + + if(avail_inst == -1) { + derr << "Unable to get an QAT instance. Failing request" << dendl; + return false; + } + + dout(15) << "Using inst " << avail_inst << dendl; + // Start polling threads for this instance + //QccCyStartPoll(avail_inst); + + auto sg = make_scope_guard([=] { + //free up the instance irrespective of the op status + dout(15) << "Completed task under " << avail_inst << dendl; + qcc_op_mem[avail_inst].op_complete = false; + QccCrypto::QccFreeInstance(avail_inst); + }); + + /* + * Allocate buffers for this version of the instance if not already done. + * Hold onto to most of them until destructor is called. + */ + if (qcc_op_mem[avail_inst].is_mem_alloc == false) { + + qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherAlgorithm = + CPA_CY_SYM_CIPHER_AES_CBC; + qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherKeyLenInBytes = + AES_256_KEY_SIZE; + + // Allocate contig memory for buffers that are independent of the + // input/output + stat = cpaCyBufferListGetMetaSize(qcc_inst->cy_inst_handles[avail_inst], + qcc_op_mem[avail_inst].num_buffers, &(qcc_op_mem[avail_inst].buff_meta_size)); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to get buff meta size" << dendl; + return false; + } + + // Allocate Buffer List Private metadata + stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_meta), + qcc_op_mem[avail_inst].buff_meta_size, 1); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to allocate private metadata memory" << dendl; + return false; + } + + // Allocate Buffer List Memory + qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_list), sizeof(CpaBufferList)); + qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_flat), + (qcc_op_mem[avail_inst].num_buffers * sizeof(CpaFlatBuffer))); + if(qcc_op_mem[avail_inst].src_buff_list == NULL || qcc_op_mem[avail_inst].src_buff_flat == NULL) { + derr << "Unable to allocate bufferlist memory" << dendl; + return false; + } + + // Allocate IV memory + stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].iv_buff), AES_256_IV_LEN); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to allocate bufferlist memory" << dendl; + return false; + } + + //Assign src stuff for the operation + (qcc_op_mem[avail_inst].src_buff_list)->pBuffers = qcc_op_mem[avail_inst].src_buff_flat; + (qcc_op_mem[avail_inst].src_buff_list)->numBuffers = qcc_op_mem[avail_inst].num_buffers; + (qcc_op_mem[avail_inst].src_buff_list)->pPrivateMetaData = qcc_op_mem[avail_inst].src_buff_meta; + + //Setup OpData + stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].sym_op_data), + sizeof(CpaCySymOpData)); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to allocate opdata memory" << dendl; + return false; + } + + // Assuming op to be encryption for initiation. 
This will be reset when we + // exit this block + qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; + // Allocate Session memory + stat = cpaCySymSessionCtxGetSize(qcc_inst->cy_inst_handles[avail_inst], + &(qcc_sess[avail_inst].sess_stp_data), &(qcc_sess[avail_inst].sess_ctx_sz)); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to find session size" << dendl; + return false; + } + + stat = qcc_contig_mem_alloc((void **)&(qcc_sess[avail_inst].sess_ctx), + qcc_sess[avail_inst].sess_ctx_sz); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to allocate contig memory" << dendl; + return false; + } + + // Set memalloc flag so that we don't go through this exercise again. + qcc_op_mem[avail_inst].is_mem_alloc = true; + dout(15) << "Instantiation complete for " << avail_inst << dendl; + } + + // Section that runs on every call + // Identify the operation and assign to session + qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection = op_type; + qcc_sess[avail_inst].sess_stp_data.cipherSetupData.pCipherKey = (Cpa8U *)key; + + stat = cpaCySymInitSession(qcc_inst->cy_inst_handles[avail_inst], + NULL, + &(qcc_sess[avail_inst].sess_stp_data), + qcc_sess[avail_inst].sess_ctx); + if (stat != CPA_STATUS_SUCCESS) { + derr << "Unable to init session" << dendl; + return false; + } + + // Allocate actual buffers that will hold data + if (qcc_op_mem[avail_inst].buff_size != (Cpa32U)size) { + qcc_contig_mem_free((void **)&(qcc_op_mem[avail_inst].src_buff)); + qcc_op_mem[avail_inst].buff_size = (Cpa32U)size; + stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff), + qcc_op_mem[avail_inst].buff_size); + if(stat != CPA_STATUS_SUCCESS) { + derr << "Unable to allocate contig memory" << dendl; + return false; + } + } + + // Copy src & iv into the respective buffers + memcpy(qcc_op_mem[avail_inst].src_buff, in, size); + memcpy(qcc_op_mem[avail_inst].iv_buff, iv, AES_256_IV_LEN); + + //Assign the reminder of the stuff + qcc_op_mem[avail_inst].src_buff_flat->dataLenInBytes = qcc_op_mem[avail_inst].buff_size; + qcc_op_mem[avail_inst].src_buff_flat->pData = qcc_op_mem[avail_inst].src_buff; + + //OpData assignment + qcc_op_mem[avail_inst].sym_op_data->sessionCtx = qcc_sess[avail_inst].sess_ctx; + qcc_op_mem[avail_inst].sym_op_data->packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + qcc_op_mem[avail_inst].sym_op_data->pIv = qcc_op_mem[avail_inst].iv_buff; + qcc_op_mem[avail_inst].sym_op_data->ivLenInBytes = AES_256_IV_LEN; + qcc_op_mem[avail_inst].sym_op_data->cryptoStartSrcOffsetInBytes = 0; + qcc_op_mem[avail_inst].sym_op_data->messageLenToCipherInBytes = qcc_op_mem[avail_inst].buff_size; + + // Perform cipher operation in a thread + qcc_thread_args* thread_args = new qcc_thread_args(); + thread_args->qccinstance = this; + thread_args->entry = avail_inst; + + if (pthread_create(&cypollthreads[avail_inst], NULL, crypt_thread, (void *)thread_args) != 0) { + derr << "Unable to create thread for crypt operation" << dendl; + return false; + } + if (qcc_inst->is_polled[avail_inst] == CPA_TRUE) { + while (!qcc_op_mem[avail_inst].op_complete) { + icp_sal_CyPollInstance(qcc_inst->cy_inst_handles[avail_inst], 0); + } + } + pthread_join(cypollthreads[avail_inst], NULL); + + if(qcc_op_mem[avail_inst].op_result != CPA_STATUS_SUCCESS) { + derr << "Unable to perform crypt operation" << dendl; + return false; + } + + //Copy data back to out buffer + memcpy(out, qcc_op_mem[avail_inst].src_buff, size); + //Always cleanup memory holding user-data at 
the end
+  memset(qcc_op_mem[avail_inst].iv_buff, 0, AES_256_IV_LEN);
+  memset(qcc_op_mem[avail_inst].src_buff, 0, qcc_op_mem[avail_inst].buff_size);
+
+  return true;
+}
diff --git a/src/crypto/qat/qcccrypto.h b/src/crypto/qat/qcccrypto.h
new file mode 100644
index 000000000..a36b0898b
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.h
@@ -0,0 +1,176 @@
+#ifndef QCCCRYPTO_H
+#define QCCCRYPTO_H
+
+#include <atomic>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <queue>
+#include <pthread.h>
+extern "C" {
+#include "cpa.h"
+#include "lac/cpa_cy_sym.h"
+#include "lac/cpa_cy_im.h"
+#include "qae_mem.h"
+#include "icp_sal_user.h"
+#include "icp_sal_poll.h"
+#include "qae_mem_utils.h"
+}
+
+class QccCrypto {
+
+  public:
+    CpaCySymCipherDirection qcc_op_type;
+
+    QccCrypto() {};
+    ~QccCrypto() {};
+
+    bool init();
+    bool destroy();
+    bool perform_op(unsigned char* out, const unsigned char* in, size_t size,
+        uint8_t *iv,
+        uint8_t *key,
+        CpaCySymCipherDirection op_type);
+
+  private:
+
+    // Currently only supporting AES_256_CBC.
+    // To-Do: Needs to be expanded
+    static const size_t AES_256_IV_LEN = 16;
+    static const size_t AES_256_KEY_SIZE = 32;
+    static const size_t QCC_MAX_RETRIES = 5000;
+
+    /*
+     * Struct to hold an instance of QAT to handle the crypto operations. These
+     * will be identified at the start and held until the destructor is called.
+     * To-Do:
+     * The struct was created assuming that we will use all the instances.
+     * Expand the current implementation to allow multiple instances to operate
+     * independently.
+     */
+    struct QCCINST {
+      CpaInstanceHandle *cy_inst_handles;
+      CpaBoolean *is_polled;
+      Cpa16U num_instances;
+    } *qcc_inst;
+
+    /*
+     * QAT Crypto Session
+     * Crypto Session Context and setup data hold the
+     * priority, type of crypto operation (cipher/chained),
+     * cipher algorithm (AES, DES, etc),
+     * single crypto or multi-buffer crypto.
+     */
+    struct QCCSESS {
+      CpaCySymSessionSetupData sess_stp_data;
+      Cpa32U sess_ctx_sz;
+      CpaCySymSessionCtx sess_ctx;
+    } *qcc_sess;
+
+    /*
+     * Cipher Memory Allocations
+     * Holds the bufferlist, flatbuffer, cipher operation data and buffer meta
+     * needed by QAT to perform the operation. Also buffers for IV, SRC, DEST.
+     */
+    struct QCCOPMEM {
+      // Op common items
+      bool is_mem_alloc;
+      bool op_complete;
+      CpaStatus op_result;
+      CpaCySymOpData *sym_op_data;
+      Cpa32U buff_meta_size;
+      Cpa32U num_buffers;
+      Cpa32U buff_size;
+
+      //Src data items
+      Cpa8U *src_buff_meta;
+      CpaBufferList *src_buff_list;
+      CpaFlatBuffer *src_buff_flat;
+      Cpa8U *src_buff;
+      Cpa8U *iv_buff;
+    } *qcc_op_mem;
+
+    //QAT HW polling thread input structure
+    struct qcc_thread_args {
+      QccCrypto* qccinstance;
+      int entry;
+    };
+
+
+    /*
+     * Function to handle the crypt operation. Will run while the main thread
+     * runs the polling function on the instance doing the op
+     */
+    void do_crypt(qcc_thread_args *thread_args);
+
+    /*
+     * Queue of free instances available to handle an op
+     */
+    std::queue<int> open_instances;
+    int QccGetFreeInstance();
+    void QccFreeInstance(int entry);
+
+    /*
+     * Contiguous Memory Allocator and de-allocator. We are using the usdm
+     * driver that comes along with QAT to get direct memory access using
+     * hugepages.
+     * To-Do: A kernel based one.
+     */
+    static inline void qcc_contig_mem_free(void **ptr) {
+      if (*ptr) {
+        qaeMemFreeNUMA(ptr);
+        *ptr = NULL;
+      }
+    }
+
+    static inline CpaStatus qcc_contig_mem_alloc(void **ptr, Cpa32U size, Cpa32U alignment = 1) {
+      *ptr = qaeMemAllocNUMA(size, 0, alignment);
+      if (NULL == *ptr)
+      {
+        return CPA_STATUS_RESOURCE;
+      }
+      return CPA_STATUS_SUCCESS;
+    }
+
+    /*
+     * Malloc & free calls masked to maintain consistency and future kernel
+     * alloc support.
+     */
+    static inline void qcc_os_mem_free(void **ptr) {
+      if (*ptr) {
+        free(*ptr);
+        *ptr = NULL;
+      }
+    }
+
+    static inline CpaStatus qcc_os_mem_alloc(void **ptr, Cpa32U size) {
+      *ptr = malloc(size);
+      if (*ptr == NULL)
+      {
+        return CPA_STATUS_RESOURCE;
+      }
+      return CPA_STATUS_SUCCESS;
+    }
+
+    std::atomic<bool> is_init = { false };
+    CpaStatus init_stat, stat;
+
+    /*
+     * Function to clean up memory if the init sequence fails
+     */
+    void cleanup();
+
+    /*
+     * Crypto Polling Function & helpers
+     * This helps to retrieve data from the QAT rings and dispatch the
+     * associated callbacks. For synchronous operation (like this one), the QAT
+     * library creates an internal callback for the operation.
+     */
+    static void* crypt_thread(void* entry);
+    CpaStatus QccCyStartPoll(int entry);
+    void poll_instance(int entry);
+
+    pthread_t *cypollthreads;
+    static const size_t qcc_sleep_duration = 2;
+};
+#endif //QCCCRYPTO_H
-- 
cgit v1.2.3
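For reference, all three backends added above (ISA-L, OpenSSL, QAT) implement the same CryptoAccel/CryptoPlugin interface, so a caller only has to obtain whichever plugin is registered and drive it through cbc_encrypt()/cbc_decrypt(). The sketch below illustrates that call path using the two-argument factory() signature shown in this patch; it is not part of the patch itself, and the plugin key "openssl" as well as the PluginRegistry::get_with_load() lookup are assumptions about the surrounding Ceph code rather than something this diff introduces.

// Illustrative sketch only. Assumes PluginRegistry::get_with_load() and the
// plugin key "openssl"; the buffer length must be a multiple of AES_256_IVSIZE
// because the backends run AES-256-CBC with padding disabled.
#include <sstream>
#include "common/ceph_context.h"
#include "common/PluginRegistry.h"
#include "crypto/crypto_accel.h"
#include "crypto/crypto_plugin.h"

bool cbc_encrypt_via_plugin(CephContext* cct,
                            unsigned char* out, const unsigned char* in, size_t size,
                            const unsigned char (&iv)[AES_256_IVSIZE],
                            const unsigned char (&key)[AES_256_KEYSIZE])
{
  // Look up a registered crypto plugin and ask its factory() for the shared
  // CryptoAccel instance (the plugins above cache it after the first call).
  auto* plugin = static_cast<CryptoPlugin*>(
      cct->get_plugin_registry()->get_with_load("crypto", "openssl"));
  if (plugin == nullptr) {
    return false;
  }
  CryptoAccelRef accel;
  std::ostringstream ss;
  if (plugin->factory(&accel, &ss) != 0 || !accel) {
    return false;
  }
  // Every backend rejects sizes that are not IV-block aligned, so the caller
  // is expected to pad or chunk the data beforehand.
  return accel->cbc_encrypt(out, in, size, iv, key);
}

Decryption is symmetric: the same lookup followed by accel->cbc_decrypt(out, in, size, iv, key).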