From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- .../intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm | 532 ++++ src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm | 31 + .../intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm | 545 +++++ .../intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm | 32 + .../intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm | 590 +++++ .../intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm | 470 ++++ .../intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm | 634 +++++ .../intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm | 483 ++++ src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm | 380 +++ src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm | 349 +++ src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm | 368 +++ src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm | 167 ++ src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm | 654 +++++ .../intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm | 303 +++ src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm | 30 + src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm | 31 + src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm | 31 + src/spdk/intel-ipsec-mb/sse/gcm_sse.asm | 2586 ++++++++++++++++++++ src/spdk/intel-ipsec-mb/sse/kasumi_sse.c | 385 +++ .../intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm | 30 + .../sse/mb_mgr_aes192_submit_sse.asm | 30 + .../intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm | 30 + .../sse/mb_mgr_aes256_submit_sse.asm | 30 + .../sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm | 518 ++++ .../sse/mb_mgr_aes_cmac_submit_flush_sse.asm | 502 ++++ .../intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm | 217 ++ .../intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm | 187 ++ .../sse/mb_mgr_aes_xcbc_flush_sse.asm | 242 ++ .../sse/mb_mgr_aes_xcbc_submit_sse.asm | 263 ++ .../sse/mb_mgr_hmac_flush_ni_sse.asm | 305 +++ .../intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm | 302 +++ .../sse/mb_mgr_hmac_md5_flush_sse.asm | 318 +++ .../sse/mb_mgr_hmac_md5_submit_sse.asm | 356 +++ .../sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm | 28 + .../sse/mb_mgr_hmac_sha_224_flush_sse.asm | 31 + .../sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm | 28 + .../sse/mb_mgr_hmac_sha_224_submit_sse.asm | 31 + .../sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm | 333 +++ .../sse/mb_mgr_hmac_sha_256_flush_sse.asm | 356 +++ .../sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm | 401 +++ .../sse/mb_mgr_hmac_sha_256_submit_sse.asm | 427 ++++ .../sse/mb_mgr_hmac_sha_384_flush_sse.asm | 31 + .../sse/mb_mgr_hmac_sha_384_submit_sse.asm | 31 + .../sse/mb_mgr_hmac_sha_512_flush_sse.asm | 331 +++ .../sse/mb_mgr_hmac_sha_512_submit_sse.asm | 412 ++++ .../sse/mb_mgr_hmac_submit_ni_sse.asm | 370 +++ .../intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm | 364 +++ src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c | 809 ++++++ src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm | 787 ++++++ src/spdk/intel-ipsec-mb/sse/pon_sse.asm | 875 +++++++ src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm | 435 ++++ src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm | 493 ++++ src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm | 512 ++++ .../intel-ipsec-mb/sse/sha224_one_block_sse.asm | 33 + src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm | 614 +++++ .../intel-ipsec-mb/sse/sha256_one_block_sse.asm | 512 ++++ .../intel-ipsec-mb/sse/sha384_one_block_sse.asm | 33 + .../intel-ipsec-mb/sse/sha512_one_block_sse.asm | 480 ++++ src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm | 449 ++++ src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm | 457 ++++ 
src/spdk/intel-ipsec-mb/sse/snow3g_sse.c | 42 + src/spdk/intel-ipsec-mb/sse/zuc_sse.asm | 1152 +++++++++ src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c | 554 +++++ 63 files changed, 23342 insertions(+) create mode 100644 src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/gcm_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/kasumi_sse.c create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm create mode 100644 
src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c create mode 100644 src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/pon_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm create mode 100644 src/spdk/intel-ipsec-mb/sse/snow3g_sse.c create mode 100755 src/spdk/intel-ipsec-mb/sse/zuc_sse.asm create mode 100755 src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c (limited to 'src/spdk/intel-ipsec-mb/sse') diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm new file mode 100644 index 000000000..7c57688ff --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm @@ -0,0 +1,532 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_cbc_dec_128_sse(void *in, +; UINT128 *IV, +; UINT128 keys[11], +; void *out, +; UINT64 len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "include/os.asm" + +%ifndef AES_CBC_DEC_128 +%define AES_CBC_DEC_128 aes_cbc_dec_128_sse +%endif + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY8 xmm8 +%define XKEY10 xmm9 +%define XIV xmm10 +%define XSAVED0 xmm11 +%define XSAVED1 xmm12 +%define XSAVED2 xmm13 +%define XSAVED3 xmm14 +%define XKEY xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(AES_CBC_DEC_128,function,internal) +AES_CBC_DEC_128: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY8 ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdeclast XDATA0, XKEY10 ; 10. 
DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. 
DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm new file mode 100644 index 000000000..72e19f482 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
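The CBC decrypt routine above implements the usual relation P[i] = AES-DEC(K, C[i]) XOR C[i-1], with C[-1] = IV, which is why each ciphertext block is copied aside (XSAVED0..2 and XIV) before it is decrypted in place. A minimal C reference sketch of the same data flow, assuming a hypothetical single-block helper aes128_decrypt_block standing in for the aesdec/aesdeclast rounds:

#include <stdint.h>
#include <string.h>

/* hypothetical stand-in for the 10 aesdec rounds + aesdeclast */
void aes128_decrypt_block(const uint8_t rk[11][16], const uint8_t in[16],
                          uint8_t out[16]);

/* reference model of aes_cbc_dec_128_sse(); len must be a multiple of 16 */
static void cbc_dec_ref(const uint8_t rk[11][16], const uint8_t iv[16],
                        const uint8_t *in, uint8_t *out, uint64_t len)
{
        uint8_t prev[16], saved[16], blk[16];

        memcpy(prev, iv, 16);                  /* C[-1] = IV */
        for (uint64_t i = 0; i < len; i += 16) {
                memcpy(saved, &in[i], 16);     /* keep C[i] for the next block */
                aes128_decrypt_block(rk, &in[i], blk);
                for (int j = 0; j < 16; j++)   /* P[i] = D(C[i]) ^ C[i-1] */
                        out[i + j] = blk[j] ^ prev[j];
                memcpy(prev, saved, 16);
        }
}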
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; Routine to compute CBC-MAC based on 128 bit CBC AES encryptionk code + +%define CBC_MAC +%include "sse/aes_cbc_enc_128_x4.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm new file mode 100644 index 000000000..11356afae --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm @@ -0,0 +1,545 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" + +; routine to do AES128 CNTR enc/decrypt "by4" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +%ifndef AES_CNTR_128 +%define AES_CNTR_128 aes_cntr_128_sse +%define AES_CNTR_BIT_128 aes_cntr_bit_128_sse +%endif + +extern byteswap_const, set_byte15, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey3 xmm11 +%define xkey6 xmm12 +%define xkey9 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef CNTR_CCM_SSE +%ifdef LINUX +%define job rdi +%define p_in rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else ;; LINUX +%define job rcx +%define p_in rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen rax +%endif ;; LINUX +%define p_IV r11 +%else ;; CNTR_CCM_SSE +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else ;; LINUX +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif ;; LINUX +%endif ;; CNTR_CCM_SSE + +%define tmp r11 +%define flags r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + paddq xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey3, [p_keys + 3*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + movdqa xkeyB, [p_keys + 4*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey3 ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 4 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey6 ; key 6 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 8*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey9, [p_keys + 9*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey9 ; 
key 9 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + movdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + pslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + pandn xtmp2, CONCAT(xdata, idx) + por xtmp2, xtmp + movdqa CONCAT(xdata, idx), xtmp2 + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM) + +%ifidn %%CNTR_TYPE, CCM + mov p_in, [job + _src] + add p_in, [job + _cipher_start_src_offset_in_bytes] + mov p_ivlen, [job + _iv_len_in_bytes] + mov num_bytes, [job + _msg_len_to_cipher_in_bytes] + mov p_keys, [job + _aes_enc_key_expanded] + mov p_out, [job + _dst] + + movdqa xbyteswap, [rel byteswap_const] + ;; Prepare IV ;; + + ;; Byte 0: flags with L' + ;; Calculate L' = 15 - Nonce length - 1 = 14 - IV length + mov flags, 14 + sub flags, p_ivlen + movd xcounter, DWORD(flags) + ;; Bytes 1 - 13: Nonce (7 - 13 bytes long) + + ;; Bytes 1 - 7 are always copied (first 7 bytes) + mov p_IV, [job + _iv] + pinsrb xcounter, [p_IV], 1 + pinsrw xcounter, [p_IV + 1], 1 + pinsrd xcounter, [p_IV + 3], 1 + + cmp p_ivlen, 7 + je _finish_nonce_move + + cmp p_ivlen, 8 + je _iv_length_8 + cmp p_ivlen, 9 + je _iv_length_9 + cmp p_ivlen, 10 + je _iv_length_10 + cmp p_ivlen, 11 + je _iv_length_11 + cmp p_ivlen, 12 + je _iv_length_12 + + ;; Bytes 8 - 13 +_iv_length_13: + pinsrb xcounter, [p_IV + 12], 13 +_iv_length_12: + pinsrb xcounter, [p_IV + 11], 12 +_iv_length_11: + pinsrd xcounter, [p_IV + 7], 2 + jmp _finish_nonce_move +_iv_length_10: + pinsrb xcounter, [p_IV + 9], 10 +_iv_length_9: + pinsrb xcounter, [p_IV + 8], 9 +_iv_length_8: + pinsrb xcounter, [p_IV + 7], 8 + +_finish_nonce_move: + ; last byte = 1 + por xcounter, [rel set_byte15] +%else ;; CNTR/CNTR_BIT +%ifndef LINUX + mov num_bytes, [rsp + 8*5] ; arg5 +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + movdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. 
Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + movdqu xcounter, [p_IV] +%endif +%endif ;; CNTR/CNTR_BIT/CCM +%%bswap_iv: + pshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + mov tmp, num_bytes + and tmp, 3*16 + jz %%chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE ; 1 block + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE ; 2 blocks + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE ; 3 blocks + add p_out, 3*16 + ; fall through to chk +%%chk: + and num_bytes, ~(3*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey3, [p_keys + 3*16] + movdqa xkey6, [p_keys + 6*16] + movdqa xkey9, [p_keys + 9*16] + +align 32 +%%main_loop2: + ; num_bytes is a multiple of 4 blocks + partial bytes + do_aes_noload 4, %%CNTR_TYPE + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: +%ifidn %%CNTR_TYPE, CCM + mov rax, job + or dword [rax + _status], STS_COMPLETED_AES +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; load partial block into XMM register + simd_load_sse_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 9 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + pxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_sse_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + pandn xtmp2, xdata0 + por xtmp2, xtmp + movdqa xdata0, xtmp2 +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +%ifdef CNTR_CCM_SSE +; JOB_AES_HMAC * aes_cntr_ccm_128_sse(JOB_AES_HMAC *job) +; arg 1 : job +MKGLOBAL(AES_CNTR_CCM_128,function,internal) +AES_CNTR_CCM_128: + DO_CNTR CCM +%else +;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +MKGLOBAL(AES_CNTR_128,function,internal) +AES_CNTR_128: + DO_CNTR CNTR + +;; aes_cntr_bit_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len) +MKGLOBAL(AES_CNTR_BIT_128,function,internal) +AES_CNTR_BIT_128: + DO_CNTR CNTR_BIT +%endif ;; CNTR_CCM_SSE + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm new file mode 100644 index 000000000..8c54715ee --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_ccm_by4_sse.asm @@ -0,0 +1,32 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
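The CCM path above assembles the initial counter block directly in xcounter: byte 0 carries the flags field L' = 15 - nonce_len - 1 = 14 - iv_len, bytes 1..iv_len hold the 7 to 13 byte nonce, and the last byte is forced to 1 so the first keystream block is generated with counter value 1; the plain CTR path with a 12-byte IV instead appends a big-endian 32-bit block counter initialized to 1. A small C sketch of both layouts, assuming nonce/nonce_iv point at the caller's IV material:

#include <stdint.h>
#include <string.h>

/* CCM: initial CTR block, counter field starts at 1 */
static void ccm_initial_block(uint8_t blk[16], const uint8_t *nonce,
                              uint64_t nonce_len)       /* 7..13 */
{
        memset(blk, 0, 16);
        blk[0] = (uint8_t)(14 - nonce_len);  /* flags: L' = 15 - nonce_len - 1 */
        memcpy(&blk[1], nonce, nonce_len);   /* nonce in bytes 1..nonce_len */
        blk[15] = 1;                         /* block counter = 1 */
}

/* plain CTR with a 12-byte nonce + ESP IV: 32-bit BE counter starts at 1 */
static void ctr_initial_block(uint8_t blk[16], const uint8_t nonce_iv[12])
{
        memcpy(blk, nonce_iv, 12);
        blk[12] = 0; blk[13] = 0; blk[14] = 0; blk[15] = 1;
}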
+;; + +%define CNTR_CCM_SSE +%ifndef AES_CNTR_CCM_128 +%define AES_CNTR_CCM_128 aes_cntr_ccm_128_sse +%endif +%include "sse/aes128_cntr_by4_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm new file mode 100644 index 000000000..144de4f70 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm @@ -0,0 +1,590 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_cbc_dec_192_sse(void *in, +; UINT128 *IV, +; UINT128 keys[13], // +1 over key length +; void *out, +; UINT64 len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "include/os.asm" + + +%ifndef AES_CBC_DEC_192 +%define AES_CBC_DEC_192 aes_cbc_dec_192_sse +%endif + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XIV xmm9 +%define XSAVED0 xmm10 +%define XSAVED1 xmm11 +%define XSAVED2 xmm12 +%define XSAVED3 xmm13 +%define XKEY_A xmm14 +%define XKEY_B xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(AES_CBC_DEC_192,function,internal) +AES_CBC_DEC_192: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + aesdec XDATA1, [KEYS + 11*16] + + aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC + aesdeclast XDATA1, [KEYS + 12*16] + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY_B ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdec XDATA0, XKEY10 ; 10. 
DEC + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + + aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdeclast XDATA0, XKEY_B ; 12. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + + + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. 
DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + + + aesdeclast XDATA0, XKEY_B ; 12. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 12. 
DECLAST + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm new file mode 100644 index 000000000..eaa89f21e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm @@ -0,0 +1,470 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" + +; routine to do AES192 CNTR enc/decrypt "by4" +; XMM registers are clobbered. 
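As the prototypes above indicate, these routines expect the fully expanded key schedule from the caller: 11, 13 and 15 16-byte round keys for AES-128, AES-192 and AES-256 (Nr + 1 round keys for Nr = 10, 12 and 14 rounds), which is why the 192-bit decrypt indexes KEYS up to 12*16 and the 192-bit counter code below finishes on round key 12. A trivial sizing helper, offered only as an illustration of that arithmetic:

#include <stdint.h>

/* 16-byte round keys needed: Nr + 1, with Nr = Nk + 6 and Nk = key words */
static inline unsigned aes_round_keys(unsigned key_bytes)
{
        return key_bytes / 4 + 7;   /* 16 -> 11, 24 -> 13, 32 -> 15 */
}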
Saving/restoring must be done at a higher level + +%ifndef AES_CNTR_192 +%define AES_CNTR_192 aes_cntr_192_sse +%define AES_CNTR_BIT_192 aes_cntr_bit_192_sse +%endif + +extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + paddq xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + movdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 
2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + movdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + pslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + pandn xtmp2, CONCAT(xdata, idx) + por xtmp2, xtmp + movdqa CONCAT(xdata, idx), xtmp2 + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + movdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. 
Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + movdqu xcounter, [p_IV] +%endif + +%%bswap_iv: + pshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + mov tmp, num_bytes + and tmp, 3*16 + jz %%chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE + add p_out, 3*16 + ; fall through to chk +%%chk: + and num_bytes, ~(3*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey4, [p_keys + 4*16] + movdqa xkey8, [p_keys + 8*16] + movdqa xkey12, [p_keys + 12*16] + +align 32 +%%main_loop2: + ; num_bytes is a multiple of 4 blocks + partial bytes + do_aes_noload 4, %%CNTR_TYPE + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: + +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; load partial block into XMM register + simd_load_sse_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 11 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + pxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_sse_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + pandn xtmp2, xdata0 + por xtmp2, xtmp + movdqa xdata0, xtmp2 +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +;; aes_cntr_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +MKGLOBAL(AES_CNTR_192,function,internal) +AES_CNTR_192: + DO_CNTR CNTR + +;; aes_cntr_bit_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len) +MKGLOBAL(AES_CNTR_BIT_192,function,internal) +AES_CNTR_BIT_192: + DO_CNTR CNTR_BIT + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm new file mode 100644 index 000000000..c82a4f58a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm @@ -0,0 +1,634 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_cbc_dec_256_sse(void *in, +; UINT128 *IV, +; UINT128 keys[15], +; void *out, +; UINT64 len_bytes); +; +; arg 1: rcx: pointer to input (cipher text) +; arg 2: rdx: pointer to IV +; arg 3: r8: pointer to keys +; arg 4: r9: pointer to output (plain text) +; arg 5: sp: length in bytes (multiple of 16) +; + +%include "include/os.asm" + +%ifndef AES_CBC_DEC_256 +%define AES_CBC_DEC_256 aes_cbc_dec_256_sse +%endif + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XIV xmm9 +%define XSAVED0 xmm10 +%define XSAVED1 xmm11 +%define XSAVED2 xmm12 +%define XSAVED3 xmm13 +%define XKEY_A xmm14 +%define XKEY_B xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(AES_CBC_DEC_256,function,internal) +AES_CBC_DEC_256: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + aesdec XDATA1, [KEYS + 11*16] + + aesdec XDATA0, [KEYS + 12*16] ; 12. DEC + aesdec XDATA1, [KEYS + 12*16] + + aesdec XDATA0, [KEYS + 13*16] ; 13. DEC + aesdec XDATA1, [KEYS + 13*16] + + aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC + aesdeclast XDATA1, [KEYS + 14*16] + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY_B ; 8. 
DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdec XDATA0, XKEY10 ; 10. DEC + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + + aesdec XDATA0, [KEYS + 12*16] ; 12. DEC + + aesdec XDATA0, [KEYS + 13*16] ; 13. DEC + + aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. 
DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. 
DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm new file mode 100644 index 000000000..6d8f211f7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm @@ -0,0 +1,483 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/const.inc" + +; routine to do AES256 CNTR enc/decrypt "by4" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +%ifndef AES_CNTR_256 +%define AES_CNTR_256 aes_cntr_256_sse +%define AES_CNTR_BIT_256 aes_cntr_bit_256_sse +%endif + +extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + paddq xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + movdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 13*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep 
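+
+	;; the final two rounds below apply round keys 13 and 14, completing
+	;; the 14-round AES-256 flow (round keys 0..14); aesenclast uses key 14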
+ + movdqa xkeyB, [p_keys + 14*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 13 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkeyB ; key 14 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + movdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + pslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + pandn xtmp2, CONCAT(xdata, idx) + por xtmp2, xtmp + movdqa CONCAT(xdata, idx), xtmp2 + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + movdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. 
Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + movdqu xcounter, [p_IV] +%endif + +%%bswap_iv: + pshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + mov tmp, num_bytes + and tmp, 3*16 + jz %%chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE + add p_out, 3*16 + ; fall through to chk +%%chk: + and num_bytes, ~(3*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey4, [p_keys + 4*16] + movdqa xkey8, [p_keys + 8*16] + movdqa xkey12, [p_keys + 12*16] + +align 32 +%%main_loop2: + ; num_bytes is a multiple of 4 blocks + partial bytes + do_aes_noload 4, %%CNTR_TYPE + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: + +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; load partial block into XMM register + simd_load_sse_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 13 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + pxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_sse_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + movq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + pand xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + pandn xtmp2, xdata0 + por xtmp2, xtmp + movdqa xdata0, xtmp2 +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_sse_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +MKGLOBAL(AES_CNTR_256,function,internal) +AES_CNTR_256: + DO_CNTR CNTR + +;; aes_cntr_bit_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bits, UINT64 iv_len) +MKGLOBAL(AES_CNTR_BIT_256,function,internal) +AES_CNTR_BIT_256: + DO_CNTR CNTR_BIT + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm new file mode 100644 index 000000000..4b07ecf90 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm @@ -0,0 +1,380 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;;; Routine to do a 128 bit CBC AES encryption / CBC-MAC digest computation +;;; processes 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_128_x4(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 8 +endstruc + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi ;r8 +%define arg4 rsi ;r9 +%endif + +%define ARG arg1 +%define LEN arg2 + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx + +%define IN1 r10 +%define KEYS1 arg3 + +%define IN2 r12 +%define KEYS2 arg4 + +%define IN3 r14 +%define KEYS3 rbp + +%ifndef CBC_MAC +;; No cipher text write back for CBC-MAC +%define OUT0 r9 +%define OUT1 r11 +%define OUT2 r13 +%define OUT3 r15 +%endif + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +%ifndef AES_CBC_ENC_X4 + +%ifdef CBC_MAC +MKGLOBAL(aes128_cbc_mac_x4,function,internal) +aes128_cbc_mac_x4: +%else +MKGLOBAL(aes_cbc_enc_128_x4,function,internal) +aes_cbc_enc_128_x4: +%endif + +%else ;; AES_CBC_ENC_X4 already defined + +%ifdef CBC_MAC +MKGLOBAL(aes128_cbc_mac_x4_no_aesni,function,internal) +aes128_cbc_mac_x4_no_aesni: +%else +MKGLOBAL(aes_cbc_enc_128_x4_no_aesni,function,internal) +aes_cbc_enc_128_x4_no_aesni: +%endif + +%endif + sub rsp, STACK_size + mov [rsp + _gpr_save + 8*0], rbp +%ifdef CBC_MAC + mov [rsp + _gpr_save + 8*1], rbx + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif +%endif + mov IDX, 16 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + +%ifndef CBC_MAC + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + 
_aesarg_out + 8*3] +%endif + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + +%ifndef CBC_MAC + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext +%endif + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. 
ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + +%ifndef CBC_MAC + ;; No cipher text write back for CBC-MAC + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertext + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertext + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertext +%endif + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV / store digest for CBC-MAC + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + +%ifndef CBC_MAC + ;; No OUT pointer updates for CBC-MAC + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 +%endif + +%ifdef CBC_MAC + mov rbx, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif +%endif + mov rbp, [rsp + _gpr_save + 8*0] + add rsp, STACK_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm new file mode 100644 index 000000000..c9f1cc3c5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm @@ -0,0 +1,349 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 192 bit CBC AES encrypt +;;; process 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_192_x4(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +%ifndef AES_CBC_ENC_X4 +%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 +%endif + +section .text + +MKGLOBAL(AES_CBC_ENC_X4,function,internal) +AES_CBC_ENC_X4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + _aesarg_out + 8*3] + + pxor XDATA0, [KEYS0 + 16*0] ; 0. 
ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. 
ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + + + + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex + + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm new file mode 100644 index 000000000..e51f4caac --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm @@ -0,0 +1,368 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 256 bit CBC AES encrypt +;;; process 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_256_x4(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +%ifndef AES_CBC_ENC_X4 +%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 +%endif + +section .text + +MKGLOBAL(AES_CBC_ENC_X4,function,internal) +AES_CBC_ENC_X4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext 
XOR IV + + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + _aesarg_out + 8*3] + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + + aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + + aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + aesenclast XDATA3, [KEYS3 + 16*14] ; 14. 
ENC + + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + + aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + + aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + aesenclast XDATA3, [KEYS3 + 16*14] ; 14. 
ENC + + + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex + + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm new file mode 100644 index 000000000..1ee400bb4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm @@ -0,0 +1,167 @@ +;; +;; Copyright (c) 2017-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/clear_regs.asm" + +;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. +;;; It processes only one buffer at a time. 
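For reference, the chaining implemented by the AES-256 CBC x4 kernel above (first block XORed with the IV, each later block XORed with the previous ciphertext, the IV slot rewritten with the final ciphertext block on exit) can be sketched with AES-NI intrinsics. The single-buffer C sketch below is illustrative only: the kernel keeps four independent buffers in flight per iteration to keep the AES unit busy, and the helper name plus the assumption of 15 pre-expanded round keys are not taken from this patch.

/* Single-buffer AES-256 CBC encryption sketch using AES-NI intrinsics.
 * Assumes len is a multiple of 16 and keys[] holds the 15 expanded
 * encryption round keys (hypothetical helper, not part of this patch).
 * Build with AES support enabled, e.g. -maes. */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

static void cbc_enc_256_sketch(const uint8_t *in, uint8_t *out, size_t len,
                               const __m128i keys[15], uint8_t iv[16])
{
    __m128i block = _mm_loadu_si128((const __m128i *)iv);

    for (size_t off = 0; off < len; off += 16) {
        /* plaintext XOR IV (or previous ciphertext), then round 0 ARK */
        block = _mm_xor_si128(block,
                              _mm_loadu_si128((const __m128i *)(in + off)));
        block = _mm_xor_si128(block, keys[0]);
        for (int r = 1; r < 14; r++)                    /* rounds 1..13 */
            block = _mm_aesenc_si128(block, keys[r]);
        block = _mm_aesenclast_si128(block, keys[14]);  /* round 14 */
        _mm_storeu_si128((__m128i *)(out + off), block);
    }
    _mm_storeu_si128((__m128i *)iv, block);  /* chain value becomes next IV */
}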
+;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX R9 R10 R11 +;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX R9 R10 +;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 +;; + +%ifndef AES_CFB_128_ONE +%define AES_CFB_128_ONE aes_cfb_128_one_sse +%endif + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define arg5 r8 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define arg5 [rsp + 5*8] +%endif + +%define OUT arg1 +%define IN arg2 +%define IV arg3 +%define KEYS arg4 +%ifdef LINUX +%define LEN arg5 +%else +%define LEN2 arg5 +%define LEN r11 +%endif + +%define TMP0 rax +%define TMP1 r10 + +%define XDATA xmm0 +%define XIN xmm1 + +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys, uint64_t len) +;; arg 1: OUT : addr to put clear/cipher text out +;; arg 2: IN : addr to take cipher/clear text from +;; arg 3: IV : initialization vector +;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) +;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) +;; +;; AES CFB128 one block encrypt/decrypt implementation. +;; The function doesn't update IV. The result of operation can be found in OUT. +;; +;; It is primarly designed to process partial block of +;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) +;; +;; It process up to one block only (up to 16 bytes). +;; +;; It makes sure not to read more than LEN bytes from IN and +;; not to store more than LEN bytes to OUT. + +MKGLOBAL(AES_CFB_128_ONE,function,) +align 32 +AES_CFB_128_ONE: +%ifndef LINUX + mov LEN, LEN2 +%endif +%ifdef SAFE_PARAM + cmp IV, 0 + jz exit_cfb + + cmp KEYS, 0 + jz exit_cfb + + cmp LEN, 0 + jz skip_in_out_check + + cmp OUT, 0 + jz exit_cfb + + cmp IN, 0 + jz exit_cfb + +skip_in_out_check: +%endif + + simd_load_sse_16 XIN, IN, LEN + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu XDATA, [IV] ; IV (or next to last block) + pxor XDATA, [KEYS + 16*0] ; 0. ARK + aesenc XDATA, [KEYS + 16*1] ; 1. ENC + aesenc XDATA, [KEYS + 16*2] ; 2. ENC + aesenc XDATA, [KEYS + 16*3] ; 3. ENC + aesenc XDATA, [KEYS + 16*4] ; 4. ENC + aesenc XDATA, [KEYS + 16*5] ; 5. ENC + aesenc XDATA, [KEYS + 16*6] ; 6. ENC + aesenc XDATA, [KEYS + 16*7] ; 7. ENC + aesenc XDATA, [KEYS + 16*8] ; 8. ENC + aesenc XDATA, [KEYS + 16*9] ; 9. ENC + aesenclast XDATA, [KEYS + 16*10] ; 10. 
ENC + + pxor XDATA, XIN ; plaintext/ciphertext XOR block cipher encryption + + simd_store_sse OUT, XDATA, LEN, TMP0, TMP1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifdef SAFE_DATA + ;; XDATA and XIN are the only scratch SIMD registers used + clear_xmms_sse XDATA, XIN + clear_scratch_gps_asm +%endif +exit_cfb: + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm new file mode 100644 index 000000000..c4b767932 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm @@ -0,0 +1,654 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void aes_ecb_x_y_sse(void *in, +; UINT128 keys[], +; void *out, +; UINT64 len_bytes); +; +; x = direction (enc/dec) +; y = key size (128/192/256) +; arg 1: IN: pointer to input (cipher text) +; arg 2: KEYS: pointer to keys +; arg 3: OUT: pointer to output (plain text) +; arg 4: LEN: length in bytes (multiple of 16) +; + +%include "include/os.asm" + +%ifndef AES_ECB_ENC_128 +%define AES_ECB_ENC_128 aes_ecb_enc_128_sse +%define AES_ECB_ENC_192 aes_ecb_enc_192_sse +%define AES_ECB_ENC_256 aes_ecb_enc_256_sse +%define AES_ECB_DEC_128 aes_ecb_dec_128_sse +%define AES_ECB_DEC_192 aes_ecb_dec_192_sse +%define AES_ECB_DEC_256 aes_ecb_dec_256_sse +%endif + +%ifdef LINUX +%define IN rdi +%define KEYS rsi +%define OUT rdx +%define LEN rcx +%else +%define IN rcx +%define KEYS rdx +%define OUT r8 +%define LEN r9 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XKEY_A xmm14 +%define XKEY_B xmm15 + +section .text + +%macro AES_ECB 2 +%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value +%define %%DIR %2 ; [in] Direction (encrypt/decrypt) + +%ifidn %%DIR, ENC +%define AES aesenc +%define AES_LAST aesenclast +%else ; DIR = DEC +%define AES aesdec +%define AES_LAST aesdeclast +%endif + mov TMP, LEN + and TMP, 3*16 + jz %%initial_4 + cmp TMP, 2*16 + jb %%initial_1 + ja %%initial_3 + +%%initial_2: + ; load plain/cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, [KEYS + 1*16] ; 1. ENC + AES XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, [KEYS + 3*16] ; 3. ENC + AES XDATA1, [KEYS + 3*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, [KEYS + 5*16] ; 5. ENC + AES XDATA1, [KEYS + 5*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, [KEYS + 7*16] ; 7. ENC + AES XDATA1, [KEYS + 7*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + AES XDATA0, [KEYS + 9*16] ; 9. ENC + AES XDATA1, [KEYS + 9*16] + +%if %%NROUNDS >= 12 + AES XDATA0, XKEY10 ; 10. ENC + AES XDATA1, XKEY10 + + AES XDATA0, [KEYS + 11*16] ; 11. ENC + AES XDATA1, [KEYS + 11*16] +%endif + +%if %%NROUNDS == 14 + AES XDATA0, [KEYS + 12*16] ; 12. ENC + AES XDATA1, [KEYS + 12*16] + + AES XDATA0, [KEYS + 13*16] ; 13. ENC + AES XDATA1, [KEYS + 13*16] +%endif + +%if %%NROUNDS == 10 + AES_LAST XDATA0, XKEY10 ; 10. ENC + AES_LAST XDATA1, XKEY10 +%elif %%NROUNDS == 12 + AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC + AES_LAST XDATA1, [KEYS + 12*16] +%else + AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC + AES_LAST XDATA1, [KEYS + 14*16] +%endif + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je %%done + jmp %%main_loop + + + align 16 +%%initial_1: + ; load plain/cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, [KEYS + 1*16] ; 1. ENC + + mov IDX, 1*16 + + AES XDATA0, XKEY2 ; 2. ENC + + movdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, [KEYS + 3*16] ; 3. ENC + + AES XDATA0, XKEY4 ; 4. 
ENC + + movdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, [KEYS + 5*16] ; 5. ENC + + AES XDATA0, XKEY6 ; 6. ENC + + movdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, [KEYS + 7*16] ; 7. ENC + + AES XDATA0, XKEY_B ; 8. ENC + + movdqa XKEY10, [KEYS + 10*16] + + AES XDATA0, [KEYS + 9*16] ; 9. ENC + +%if %%NROUNDS >= 12 + AES XDATA0, XKEY10 ; 10. ENC + + AES XDATA0, [KEYS + 11*16] ; 11. ENC +%endif + +%if %%NROUNDS == 14 + AES XDATA0, [KEYS + 12*16] ; 12. ENC + + AES XDATA0, [KEYS + 13*16] ; 13. ENC +%endif + +%if %%NROUNDS == 10 + + AES_LAST XDATA0, XKEY10 ; 10. ENC +%elif %%NROUNDS == 12 + AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC +%else + AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC +%endif + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je %%done + jmp %%main_loop + + +%%initial_3: + ; load plain/cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, XKEY_A ; 5. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + movdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + +%if %%NROUNDS >= 12 + movdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + movdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + +%endif + +%if %%NROUNDS == 14 + movdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je %%done + jmp %%main_loop + + + align 16 +%%initial_4: + ; load plain/cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + AES XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. 
ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + AES XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, XKEY_A ; 5. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + AES XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + +%if %%NROUNDS >= 12 + movdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + +%if %%NROUNDS == 14 + movdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + AES_LAST XDATA3, XKEY_B + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz %%done + jmp %%main_loop + + align 16 +%%main_loop: + ; load plain/cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + AES XDATA3, XKEY2 + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + AES XDATA3, XKEY4 + + AES XDATA0, XKEY_A ; 5. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + AES XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + +%if %%NROUNDS >= 12 + movdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + +%if %%NROUNDS == 14 + movdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. 
ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + AES_LAST XDATA3, XKEY_B + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + cmp IDX, LEN + jne %%main_loop + +%%done: + + ret + +%endmacro + +align 16 +MKGLOBAL(AES_ECB_ENC_128,function,internal) +AES_ECB_ENC_128: + + AES_ECB 10, ENC + +align 16 +MKGLOBAL(AES_ECB_ENC_192,function,internal) +AES_ECB_ENC_192: + + AES_ECB 12, ENC + +align 16 +MKGLOBAL(AES_ECB_ENC_256,function,internal) +AES_ECB_ENC_256: + + AES_ECB 14, ENC + +align 16 +MKGLOBAL(AES_ECB_DEC_128,function,internal) +AES_ECB_DEC_128: + + AES_ECB 10, DEC + +align 16 +MKGLOBAL(AES_ECB_DEC_192,function,internal) +AES_ECB_DEC_192: + + AES_ECB 12, DEC + +align 16 +MKGLOBAL(AES_ECB_DEC_256,function,internal) +AES_ECB_DEC_256: + + AES_ECB 14, DEC + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm new file mode 100644 index 000000000..afbb38512 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm @@ -0,0 +1,303 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
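The AES_ECB macro above runs one round skeleton for both directions, selecting aesenc/aesenclast or aesdec/aesdeclast, and peels off len mod 64 bytes as one to three leading blocks before looping four blocks at a time. A minimal per-block C sketch of that direction switch follows; it assumes keys[] already holds the schedule the chosen direction expects (for aesdec this is the InvMixColumns-transformed decryption schedule), and every name in it is illustrative rather than taken from the library.

/* One AES block in ECB mode, parameterized by direction, mirroring the
 * ENC/DEC switch in the AES_ECB macro. nrounds is 10/12/14 for
 * AES-128/192/256. Hypothetical helper, not part of this patch. */
#include <immintrin.h>
#include <stdbool.h>

static __m128i ecb_block_sketch(__m128i b, const __m128i *keys,
                                int nrounds, bool encrypt)
{
    b = _mm_xor_si128(b, keys[0]);                 /* round 0: AddRoundKey */
    for (int r = 1; r < nrounds; r++)
        b = encrypt ? _mm_aesenc_si128(b, keys[r])
                    : _mm_aesdec_si128(b, keys[r]);
    return encrypt ? _mm_aesenclast_si128(b, keys[nrounds])
                   : _mm_aesdeclast_si128(b, keys[nrounds]);
}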
+;; + +;;; routine to do 128 bit AES XCBC +;;; process 4 buffers at a time, single data structure as input +;;; Updates In pointer at end + +;; clobbers all registers except for ARG1 and rbp + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + + +%ifndef AES_XCBC_X4 +%define AES_XCBC_X4 aes_xcbc_mac_128_x4 +%endif + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_XCBC_ARGS_x8 { +;; void* in[8]; +;; UINT128* keys[8]; +;; UINT128 ICV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +MKGLOBAL(AES_XCBC_X4,function,internal) +AES_XCBC_X4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesxcbcarg_in + 8*0] + mov IN1, [ARG + _aesxcbcarg_in + 8*1] + mov IN2, [ARG + _aesxcbcarg_in + 8*2] + mov IN3, [ARG + _aesxcbcarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] + mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] + mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] + mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV + pxor XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV + pxor XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV + pxor XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. 
ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR ICV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR ICV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR ICV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR ICV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. 
ENC + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update ICV + movdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 + movdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 + movdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 + movdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 + + ;; update IN + add IN0, LEN + mov [ARG + _aesxcbcarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesxcbcarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesxcbcarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesxcbcarg_in + 8*3], IN3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm new file mode 100644 index 000000000..b8d3ea963 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm @@ -0,0 +1,30 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM128_MODE 1 +%include "sse/gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm new file mode 100644 index 000000000..68e995a06 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2017-2018, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM192_MODE 1 +%include "sse/gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm new file mode 100644 index 000000000..3898411a1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "sse/gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm new file mode 100644 index 000000000..d053da51f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm @@ -0,0 +1,2586 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. 
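As a rough illustration of the carry-less multiplication method cited above (and used by the GHASH_MUL macro later in this file), the Karatsuba step forms the 256-bit product of two 128-bit operands from three PCLMULQDQ operations; the reduction modulo x^128 + x^127 + x^126 + x^121 + 1 is left out here. The C-intrinsics sketch below is only a reading aid and every name in it is illustrative.

/* Karatsuba carry-less multiply of two 128-bit values, as in GHASH_MUL:
 * three PCLMULQDQ operations produce the 256-bit product <hi:lo>.
 * The GHASH reduction step is intentionally omitted. Build with PCLMUL
 * support enabled, e.g. -mpclmul. Hypothetical helper, not part of this
 * patch. */
#include <immintrin.h>

static void clmul_karatsuba_sketch(__m128i a, __m128i b,
                                   __m128i *hi, __m128i *lo)
{
    __m128i a_sum = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e)); /* a1^a0 */
    __m128i b_sum = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e)); /* b1^b0 */

    __m128i t_hi  = _mm_clmulepi64_si128(a, b, 0x11);        /* a1*b1 */
    __m128i t_lo  = _mm_clmulepi64_si128(a, b, 0x00);        /* a0*b0 */
    __m128i t_mid = _mm_clmulepi64_si128(a_sum, b_sum, 0x00);
    t_mid = _mm_xor_si128(t_mid, _mm_xor_si128(t_hi, t_lo)); /* a0*b1^a1*b0 */

    /* fold the middle term into the low and high 128-bit halves */
    *lo = _mm_xor_si128(t_lo, _mm_slli_si128(t_mid, 8));
    *hi = _mm_xor_si128(t_hi, _mm_srli_si128(t_mid, 8));
}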
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "include/os.asm" +%include "include/reg_sizes.asm" +%include "include/clear_regs.asm" +%include "include/gcm_defines.asm" +%include "include/gcm_keys_sse_avx.asm" +%include "include/memcpy.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_sse.asm!" 
+%endif +%endif +%endif + +%ifdef NO_AESNI +%define SSE sse_no_aesni +%else +%define SSE sse +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ SSE +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ SSE +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ SSE +%define NROUNDS 13 +%endif + +default rel +; need to push 4 registers into stack to maintain +%define STACK_OFFSET 8*4 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqa %%T1, %%GH + pshufd %%T2, %%GH, 01001110b + pshufd %%T3, %%HK, 01001110b + pxor %%T2, %%GH ; %%T2 = (a1+a0) + pxor %%T3, %%HK ; %%T3 = (b1+b0) + + pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T2, %%GH + pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%GH, %%T3 + pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK + + + ;first phase of the reduction + movdqa %%T2, %%GH + movdqa %%T3, %%GH + movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%GH + movdqa %%T4,%%GH + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; 
packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T5 + pxor %%GH, %%T2 + pxor %%GH, %%T1 ; the result is in %%T1 + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + movdqa %%T4, %%HK + pshufd %%T1, %%HK, 01001110b + pxor %%T1, %%HK + movdqu [%%GDATA + HashKey_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly + movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly + movdqu [%%GDATA + HashKey_3], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_3_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly + movdqu [%%GDATA + HashKey_4], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly + movdqu [%%GDATA + HashKey_5], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_5_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly + movdqu [%%GDATA + HashKey_6], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly + movdqu [%%GDATA + HashKey_7], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly + movdqu [%%GDATA + HashKey_8], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_8_k], %%T1 + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). 
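READ_SMALL_DATA_INPUT, defined next, packs a partial block of fewer than 16 bytes into an XMM register with pinsrq plus a byte-at-a-time loop, so it never reads past the end of the caller's buffer. A plain C sketch of the same intent follows; the stack copy is purely illustrative (the assembly avoids it) and the helper name is hypothetical.

/* Zero-pad and load a partial block (len < 16) without reading beyond
 * the input buffer. Illustrative stand-in for READ_SMALL_DATA_INPUT;
 * not part of this patch. */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static __m128i read_partial_block_sketch(const uint8_t *in, size_t len)
{
    uint8_t tmp[16] = {0};              /* zero-padded scratch block */
    memcpy(tmp, in, len);               /* copies only the valid bytes */
    return _mm_loadu_si128((const __m128i *)tmp);
}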
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + pxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + pinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + pinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 15 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%GDATA_KEY %4 +%define %%XTMP0 %5 ; xmm temp reg 5 +%define %%XTMP1 %6 ; xmm temp reg 5 +%define %%XTMP2 %7 +%define %%XTMP3 %8 +%define %%XTMP4 %9 +%define %%XTMP5 %10 ; xmm temp reg 5 +%define %%T1 %11 ; temp reg 1 +%define %%T2 %12 +%define %%T3 %13 +%define %%T4 %14 +%define %%T5 %15 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + pxor %%AAD_HASH, %%AAD_HASH + +%%_get_AAD_loop128: + cmp %%T2, 128 + jl %%_exit_AAD_loop128 + + movdqu %%XTMP0, [%%T1 + 16*0] + pshufb %%XTMP0, [rel SHUF_MASK] + + pxor %%XTMP0, %%AAD_HASH + + movdqu %%XTMP5, [%%GDATA_KEY + HashKey_8] + movdqa %%XTMP1, %%XTMP0 + movdqa %%XTMP2, %%XTMP0 + movdqa %%XTMP3, %%XTMP0 + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP1, %%XTMP5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%XTMP2, %%XTMP5, 0x00 ; %%T2 = a0*b0 + pclmulqdq %%XTMP3, %%XTMP5, 0x01 ; %%T3 = a1*b0 + pclmulqdq %%XTMP4, %%XTMP5, 0x10 ; %%T4 = a0*b1 + pxor %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 + +%assign i 1 +%assign j 7 +%rep 7 + movdqu %%XTMP0, [%%T1 + 16*i] + pshufb %%XTMP0, [rel SHUF_MASK] + + movdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j] + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 + pxor %%XTMP1, %%XTMP4 + + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 + pxor %%XTMP2, %%XTMP4 + + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 + pxor %%XTMP3, %%XTMP4 + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x10 + pxor %%XTMP3, %%XTMP4 +%assign i (i + 1) +%assign j (j - 1) +%endrep + + movdqa %%XTMP4, %%XTMP3 + pslldq %%XTMP4, 8 ; shift-L 2 DWs + psrldq %%XTMP3, 8 ; shift-R 2 DWs + pxor %%XTMP2, %%XTMP4 + pxor %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) + + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + movdqa %%XTMP5, [rel POLY2] + movdqa %%XTMP0, %%XTMP5 + pclmulqdq %%XTMP0, %%XTMP2, 0x01 + pslldq %%XTMP0, 8 ; shift-L xmm2 2 DWs + pxor %%XTMP2, %%XTMP0 ; first phase of the reduction complete + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + movdqa %%XTMP3, %%XTMP5 + pclmulqdq %%XTMP3, %%XTMP2, 0x00 + psrldq %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + movdqa %%XTMP4, %%XTMP5 + pclmulqdq %%XTMP4, %%XTMP2, 0x10 + pslldq %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) + + pxor %%XTMP4, %%XTMP3 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa %%AAD_HASH, %%XTMP1 + pxor %%AAD_HASH, %%XTMP4 ; the result is in %%T1 + + sub %%T2, 128 + je %%_CALC_AAD_done + + add %%T1, 128 + jmp %%_get_AAD_loop128 + +%%_exit_AAD_loop128: + cmp %%T2, 16 + jl %%_get_small_AAD_block + + ;; calculate hash_key position to start with + mov %%T3, %%T2 + and %%T3, -16 ; 1 to 7 blocks possible here + neg %%T3 + add %%T3, HashKey_1 + 16 + lea %%T3, [%%GDATA_KEY + %%T3] + + movdqu %%XTMP0, [%%T1] + pshufb %%XTMP0, [rel SHUF_MASK] + + pxor %%XTMP0, %%AAD_HASH + + movdqu %%XTMP5, [%%T3] + movdqa %%XTMP1, %%XTMP0 + movdqa %%XTMP2, %%XTMP0 + movdqa %%XTMP3, %%XTMP0 + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP1, %%XTMP5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%XTMP2, %%XTMP5, 0x00 ; %%T2 = a0*b0 + pclmulqdq %%XTMP3, %%XTMP5, 0x01 ; %%T3 = a1*b0 + pclmulqdq %%XTMP4, %%XTMP5, 0x10 ; %%T4 = a0*b1 + pxor %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 + + add %%T3, 16 ; move to next hashkey + add %%T1, 16 ; move to next data block + sub %%T2, 16 + cmp %%T2, 16 + jl %%_AAD_reduce + +%%_AAD_blocks: + movdqu %%XTMP0, [%%T1] + pshufb %%XTMP0, [rel SHUF_MASK] + + movdqu %%XTMP5, [%%T3] + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 + pxor %%XTMP1, %%XTMP4 + + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 + pxor %%XTMP2, %%XTMP4 + + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 + pxor %%XTMP3, %%XTMP4 + movdqa %%XTMP4, %%XTMP0 + pclmulqdq %%XTMP4, %%XTMP5, 0x10 + pxor %%XTMP3, %%XTMP4 + + add %%T3, 16 ; move to next hashkey + add %%T1, 16 + sub %%T2, 16 + cmp %%T2, 16 + jl %%_AAD_reduce + jmp %%_AAD_blocks + +%%_AAD_reduce: + movdqa %%XTMP4, %%XTMP3 + pslldq %%XTMP4, 8 ; shift-L 2 DWs + psrldq %%XTMP3, 8 ; shift-R 2 DWs + pxor %%XTMP2, %%XTMP4 + pxor %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + movdqa %%XTMP5, [rel POLY2] + movdqa %%XTMP0, %%XTMP5 + pclmulqdq %%XTMP0, %%XTMP2, 0x01 + pslldq %%XTMP0, 8 ; shift-L xmm2 2 DWs + pxor %%XTMP2, %%XTMP0 ; first phase of the reduction complete + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + movdqa %%XTMP3, %%XTMP5 + pclmulqdq %%XTMP3, %%XTMP2, 0x00 + psrldq %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + movdqa %%XTMP4, %%XTMP5 + pclmulqdq %%XTMP4, %%XTMP2, 0x10 + pslldq %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) + + 
pxor %%XTMP4, %%XTMP3 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa %%AAD_HASH, %%XTMP1 + pxor %%AAD_HASH, %%XTMP4 ; the result is in %%T1 + + or %%T2, %%T2 + je %%_CALC_AAD_done + +%%_get_small_AAD_block: + movdqu %%XTMP0, [%%GDATA_KEY + HashKey] + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + pshufb %%XTMP1, [rel SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. +; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC). +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + mov r13, [%%GDATA_CTX + PBlockLen] + +%%_data_read: ;Finished reading in data + + + movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key + movdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + movdqa xmm3, xmm1 + pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pand xmm3, xmm1 + pshufb xmm3, [SHUF_MASK] + pshufb xmm3, xmm2 + pxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_dec_done: 
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pshufb xmm9, [SHUF_MASK] + pshufb xmm9, xmm2 + pxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_encode_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + pshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + + ; start AES for %%num_initial_blocks blocks + movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + paddd %%CTR, [ONE] ; INCR Y0 + movdqa reg(i), %%CTR + pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +movdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + pxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenc reg(i),%%T_key +%assign i 
(i+1) +%endrep + +%assign j (j+1) +%endrep + + +movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192) +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + pxor reg(i), %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + movdqa reg(i), %%T1 + %endif + pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + pxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + movdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and store cipher/plain text. +; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8) +; - combine current GHASH value into block 0 (XMM1) + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM1, %%CTR + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM2, %%CTR + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM3, %%CTR + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM4, %%CTR + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM5, %%CTR + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM6, %%CTR + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM7, %%CTR + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM8, %%CTR + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + movdqu %%T_key, [%%GDATA_KEY+16*0] + pxor %%XMM1, %%T_key + pxor %%XMM2, %%T_key + pxor %%XMM3, %%T_key + pxor %%XMM4, %%T_key + pxor %%XMM5, %%T_key + pxor %%XMM6, %%T_key + pxor %%XMM7, %%T_key + pxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS ; do early (13) rounds (11 for GCM192) + movdqu %%T_key, [%%GDATA_KEY+16*i] + aesenc %%XMM1, %%T_key + aesenc %%XMM2, %%T_key + aesenc %%XMM3, %%T_key + aesenc %%XMM4, %%T_key + aesenc %%XMM5, %%T_key + aesenc %%XMM6, %%T_key + aesenc %%XMM7, %%T_key + aesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round + aesenclast %%XMM1, %%T_key + aesenclast %%XMM2, %%T_key + aesenclast %%XMM3, %%T_key + aesenclast %%XMM4, %%T_key + aesenclast %%XMM5, %%T_key + aesenclast %%XMM6, %%T_key + aesenclast %%XMM7, %%T_key + aesenclast %%XMM8, %%T_key + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + pxor %%XMM1, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM1, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + pxor %%XMM2, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM2, %%T1 + %endif + + XLDR %%T1, 
[%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + pxor %%XMM3, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM3, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + pxor %%XMM4, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM4, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + pxor %%XMM5, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM5, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + pxor %%XMM6, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM6, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + pxor %%XMM7, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM7, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + pxor %%XMM8, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + movdqa %%T7, %%XMM1 + movdqu [rsp + TMP2], %%XMM2 + movdqu [rsp + TMP3], %%XMM3 + movdqu [rsp + TMP4], %%XMM4 + movdqu [rsp + TMP5], %%XMM5 + movdqu [rsp + TMP6], %%XMM6 + movdqu [rsp + TMP7], %%XMM7 + movdqu [rsp + TMP8], %%XMM8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + + movdqa %%T4, %%T7 + pshufd %%T6, %%T7, 01001110b + pxor %%T6, %%T7 + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + %else + paddd %%CTR, [ONEf] ; INCR CNT + %endif + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_8_k] + pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + movdqa %%XMM1, %%CTR + + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM5, 
%%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM8, %%CTR + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + %else + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM8, %%CTR + %endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu %%T1, [%%GDATA + 16*0] + pxor %%XMM1, %%T1 + pxor %%XMM2, %%T1 + pxor %%XMM3, %%T1 + pxor %%XMM4, %%T1 + pxor %%XMM5, %%T1 + pxor %%XMM6, %%T1 + pxor %%XMM7, %%T1 + pxor %%XMM8, %%T1 + + ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP2] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*1] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + movdqu %%T1, [%%GDATA + 16*2] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqu %%T1, [rsp + TMP3] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*3] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP4] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the 
middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*4] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*5] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP5] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + + movdqu %%T1, [%%GDATA + 16*6] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + movdqu %%T1, [rsp + TMP6] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*7] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP7] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*8] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP8] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T7, %%T3 + pxor %%T4, %%T1 + + movdqu %%T1, [%%GDATA + 16*9] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + +%ifdef GCM128_MODE + movdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc 
%%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*12] ; finish last key round +%endif +%ifdef GCM256_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*12] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*13] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*14] ; finish last key round +%endif + +%assign i 0 +%assign j 1 +%rep 8 + XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + +%ifidn %%ENC_DEC, DEC + movdqa %%T3, %%T1 +%endif + + pxor %%T1, %%T5 + aesenclast reg(j), %%T1 ; XMM1:XMM8 + XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer + +%ifidn %%ENC_DEC, DEC + movdqa reg(j), %%T3 +%endif +%assign i (i+1) +%assign j (j+1) +%endrep + + + + + pxor %%T2, %%T6 + pxor %%T2, %%T4 + pxor %%T2, %%T7 + + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T3 + pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 + + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T1, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T1 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T1,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T1,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T1 + + pxor %%T2, %%T5 + pxor %%T7, %%T2 + pxor %%T7, %%T4 ; the result is in %%T4 + + + pxor %%XMM1, %%T7 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. 
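Editor's note (not part of the patch): the Karatsuba three-product split and the two-phase shift reduction used in GHASH_8_ENCRYPT_8_PARALLEL above, and in the GHASH_LAST_8 macro below, implement multiplication in GF(2^128). As a plain-C reference model of what those pclmulqdq sequences compute, here is the bit-serial multiply from the GCM specification; the be128 helper type is illustrative only and holds a block as a big-endian 128-bit value (bytes 0..7 in hi, bytes 8..15 in lo).

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } be128;   /* illustrative block type */

    /* reference GF(2^128) multiply, polynomial x^128 + x^7 + x^2 + x + 1 */
    static be128 gf128_mul(be128 x, be128 y)
    {
        be128 z = { 0, 0 };
        be128 v = y;

        for (int i = 0; i < 128; i++) {
            /* bit i of x, numbered from the most significant bit of byte 0 */
            uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1
                                   : (x.lo >> (127 - i)) & 1;
            if (xi) {
                z.hi ^= v.hi;
                z.lo ^= v.lo;
            }
            /* v = v >> 1, folding the reduction polynomial back in on carry-out */
            uint64_t carry = v.lo & 1;
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (carry)
                v.hi ^= 0xE100000000000000ULL;
        }
        return z;
    }

GHASH of a message is then the per-block recurrence z = gf128_mul(z ^ block, H), which is exactly what the macros here batch eight blocks at a time using the precomputed powers HashKey_1..HashKey_8.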
+%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ; Karatsuba Method + movdqa %%T6, %%XMM1 + pshufd %%T2, %%XMM1, 01001110b + pxor %%T2, %%XMM1 + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1 + + pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_8_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + movdqa %%T7, %%XMM1 + movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM2 + pshufd %%T2, %%XMM2, 01001110b + pxor %%T2, %%XMM2 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM2 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM3 + pshufd %%T2, %%XMM3, 01001110b + pxor %%T2, %%XMM3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM3 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM4 + pshufd %%T2, %%XMM4, 01001110b + pxor %%T2, %%XMM4 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM4 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM5 + pshufd %%T2, %%XMM5, 01001110b + pxor %%T2, %%XMM5 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM5 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM6 + pshufd %%T2, %%XMM6, 01001110b + pxor %%T2, %%XMM6 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM6 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM7 + pshufd %%T2, %%XMM7, 01001110b + pxor %%T2, %%XMM7 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM7 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM8 + pshufd %%T2, %%XMM8, 01001110b + pxor %%T2, %%XMM8 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_k] + 
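+        ;; HashKey_k holds the precomputed XOR of the high and low 64-bit halves of
+        ;; HashKey, so only the (a1+a0)*(b1+b0) middle product of the Karatsuba
+        ;; decomposition remains to be computed here.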
pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM8 + pxor %%T2, %%XMM1 + pxor %%T2, %%T6 + pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm + + + movdqa %%T4, %%T2 + pslldq %%T4, 8 ; shift-L %%T4 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T4 + pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T1, %%T2 + psrldq %%T1, 4 ; shift-R %%T1 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T4,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T1 + pxor %%T7, %%T2 + pxor %%T6, %%T7 ; the result is in %%T6 + +%endmacro + +; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 3 +%define %%GDATA %1 +%define %%ST %2 +%define %%T1 %3 + movdqu %%T1, [%%GDATA+16*0] + pxor %%ST, %%T1 +%assign i 1 +%rep NROUNDS + movdqu %%T1, [%%GDATA+16*i] + aesenc %%ST, %%T1 +%assign i (i+1) +%endrep + movdqu %%T1, [%%GDATA+16*i] + aesenclast %%ST, %%T1 +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GCM_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_sse_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GCM_ENC + mov rsp, r14 + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for 
encoding/decoding. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +; Additional Authentication data (A_IN), Additional Data length (A_LEN). +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13 and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + pxor xmm2, xmm3 + mov r10, %%A_LEN + + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + pinsrq xmm2, [r10], 0 + pinsrd xmm2, [r10+8], 2 + movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + pshufb xmm2, [SHUF_MASK] + + movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data +; struct has been initialized by GCM_INIT. +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' + + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET +%ifidn __OUTPUT_FORMAT__, win64 + mov r12, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], r12 ;Update length of data processed +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed +%endif + movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + movdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext + sub r13, %%DATA_OFFSET + mov r10, r13 ;save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je 
%%_eight_cipher_left + + + + + movd r15d, xmm9 + and r15d, 255 + pshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + pshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + pshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + movdqu [%%GDATA_CTX + AadHash], xmm14 + movdqu [%%GDATA_CTX + CurCount], xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13 + ; handle the last <16 Byte block seperately + + paddd xmm9, [ONE] ; INCR CNT to get Yn + movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9 + pshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn) + movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: + %ifidn %%ENC_DEC, DEC + movdqa xmm2, xmm1 + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pand xmm2, xmm1 + pshufb xmm2, [SHUF_MASK] + pxor xmm14, xmm2 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + %else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pshufb xmm9, [SHUF_MASK] + pxor xmm14, xmm9 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + %endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + 
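+        ;; the tail was stored byte by byte above so that no write reaches past the
+        ;; end of the caller's output buffer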
;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes) + movdqu xmm14, [%%GDATA_CTX + AadHash] + movdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + movdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + movd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + movq xmm1, %%PLAIN_CYPH_LEN + pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + pxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + pxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0) + + pxor xmm9, xmm14 + + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + + cmp r11, 8 + je %%_T_8 + + simd_store_sse r10, xmm9, r11, r12, rax + jmp %%_return_T_done +%%_T_8: + movq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + movq rax, xmm9 + mov [r10], rax + psrldq xmm9, 8 + movd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done +%%_T_16: + movdqu [r10], xmm9 + +%%_return_T_done: + +%ifdef SAFE_DATA + ;; Clear sensitive data from context structure + pxor xmm0, xmm0 + movdqu [%%GDATA_CTX + AadHash], xmm0 + movdqu [%%GDATA_CTX + PBlockEncKey], xmm0 +%endif + +%endmacro ;GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_precomp +%endif + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + pxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey + + pshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + movdqa xmm2, xmm6 + psllq xmm6, 1 + psrlq xmm2, 63 + movdqa xmm1, xmm2 + pslldq xmm2, 8 + psrldq xmm1, 8 + por xmm6, xmm2 + ;reduction + pshufd xmm2, xmm1, 00100100b 
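+        ;; the bit carried out of the doubling above (spread across xmm2) selects,
+        ;; via the compare with TWOONE and the mask with POLY below, whether the
+        ;; field polynomial is folded back in to complete HashKey<<1 mod poly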
+ pcmpeqd xmm2, [TWOONE] + pand xmm2, [POLY] + pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_sse_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 + +exit_precomp: + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse ( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + mov r14, rsp + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 1*16 + movdqu [rsp + 0*16], xmm6 +%endif + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_init + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_init + + ;; Check IV != NULL + cmp arg3, 0 + jz exit_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + cmp arg4, 0 + jz exit_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_sse_asm +%endif +exit_init: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + mov rsp, r14 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_enc + +skip_in_out_check_update_enc: +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + +exit_update_enc: + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_dec + + ;; 
Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_dec + +skip_in_out_check_update_dec: +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + +exit_update_dec: + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_enc_fin + + cmp arg4, 16 + ja exit_enc_fin +%endif + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_sse_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + +exit_enc_fin: + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_dec_fin + + cmp arg4, 16 + ja exit_dec_fin +%endif + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_sse_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + +exit_dec_fin: + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 
plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_enc + + cmp arg10, 16 + ja exit_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + +exit_enc: + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_dec + + cmp arg10, 16 + ja exit_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_dec + +skip_aad_check_dec: +%endif + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + +exit_dec: + FUNC_RESTORE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c b/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c new file mode 100644 index 000000000..b1ef71a8a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/kasumi_sse.c @@ -0,0 +1,385 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include + +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse + +#include "include/kasumi_internal.h" +#include "include/save_xmms.h" +#include "include/clear_regs_mem.h" + +#define SAVE_XMMS save_xmms +#define RESTORE_XMMS restore_xmms + +void +kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (cipherLengthInBytes == 0 || + cipherLengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_1_buffer(pCtx, IV, pBufferIn, pBufferOut, + cipherLengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (cipherLengthInBits == 0 || + cipherLengthInBits > KASUMI_MAX_LEN) + return; +#endif + kasumi_f8_1_buffer_bit(pCtx, IV, pBufferIn, pBufferOut, + cipherLengthInBits, offsetInBits); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const void *pBufferIn1, + void *pBufferOut1, const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ 
+ if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes1 == 0 || lengthInBytes1 > (KASUMI_MAX_LEN / CHAR_BIT)) + return; + + if (lengthInBytes2 == 0 || lengthInBytes2 > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_2_buffer(pCtx, IV1, IV2, + pBufferIn1, pBufferOut1, lengthInBytes1, + pBufferIn2, pBufferOut2, lengthInBytes2); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + if (pBufferIn3 == NULL || pBufferOut3 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_3_buffer(pCtx, IV1, IV2, IV3, + pBufferIn1, pBufferOut1, + pBufferIn2, pBufferOut2, + pBufferIn3, pBufferOut3, lengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + if (pBufferIn3 == NULL || pBufferOut3 == NULL) + return; + + if (pBufferIn4 == NULL || pBufferOut4 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_4_buffer(pCtx, IV1, IV2, IV3, IV4, + pBufferIn1, pBufferOut1, + pBufferIn2, pBufferOut2, + pBufferIn3, pBufferOut3, + pBufferIn4, pBufferOut4, + lengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + uint32_t numLeft = dataCount; + const uint64_t *IVPtr; + const void * const *pDataInPtr; + void **pDataOutPtr; + const uint32_t *dataLenPtr; + uint32_t i = 0; + uint32_t 
numBuffs; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKeySchedule == NULL || pDataIn == NULL || pDataOut == NULL || + dataLen == NULL || IV == NULL) + return; + + for (i = 0; i < dataCount; i++) { + /* Check for NULL pointers */ + if (pDataIn[i] == NULL || pDataOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (dataLen[i] == 0 || dataLen[i] > (KASUMI_MAX_LEN / CHAR_BIT)) + return; + } +#endif + + i = 0; + + /* KASUMI F8 n buffer function can handle up to 16 buffers */ + while (numLeft > 0) { + IVPtr = &IV[i]; + pDataInPtr = &pDataIn[i]; + pDataOutPtr = &pDataOut[i]; + dataLenPtr = &dataLen[i]; + numBuffs = (numLeft > 16) ? 16 : numLeft; + + kasumi_f8_n_buffer(pKeySchedule, IVPtr, pDataInPtr, pDataOutPtr, + dataLenPtr, numBuffs); + i += numBuffs; + numLeft -= numBuffs; + } +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + + +void +kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx, const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f9_1_buffer(pCtx, pBufferIn, lengthInBytes, pDigest); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, const uint32_t lengthInBits, + void *pDigest, const uint32_t direction) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBits == 0 || lengthInBits > KASUMI_MAX_LEN) + return; +#endif + kasumi_f9_1_buffer_user(pCtx, IV, pBufferIn, lengthInBits, + pDigest, direction); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +int +kasumi_init_f8_key_sched_sse(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_init_f8_key_sched(pKey, pCtx); +} + +int +kasumi_init_f9_key_sched_sse(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_init_f9_key_sched(pKey, pCtx); +} + +size_t +kasumi_key_sched_size_sse(void) +{ + return kasumi_key_sched_size(); +} diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm new file mode 100644 index 000000000..305c80342 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_sse +%include "sse/mb_mgr_aes_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm new file mode 100644 index 000000000..c9129e758 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
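The KASUMI wrappers added above are thin shims: under SAFE_PARAM they reject NULL pointers and out-of-range lengths, on Windows they save and restore the ten callee-saved XMM registers around the call, under SAFE_DATA they scrub scratch registers, and otherwise they forward to the generic kasumi_* routines; kasumi_f8_n_buffer_sse additionally splits the request into batches of at most 16 buffers. A minimal caller-side sketch, assuming the library's public header is intel-ipsec-mb.h and using made-up key, IVs and buffer sizes:

/*
 * Hypothetical caller of the KASUMI SSE wrappers above.
 * Header name, key, IVs and buffer sizes are illustrative only.
 */
#include <stdint.h>
#include <stdlib.h>
#include <intel-ipsec-mb.h>     /* assumed to expose kasumi_*_sse() */

#define NUM_BUFS 20             /* > 16, so the wrapper batches as 16 + 4 */
#define BUF_LEN  64

int main(void)
{
        const uint8_t key[16] = {0};          /* demo key only */
        kasumi_key_sched_t sched;
        uint64_t iv[NUM_BUFS];
        uint32_t len[NUM_BUFS];
        void *in[NUM_BUFS], *out[NUM_BUFS];
        uint32_t i;

        if (kasumi_init_f8_key_sched_sse(key, &sched) != 0)
                return 1;

        for (i = 0; i < NUM_BUFS; i++) {
                iv[i] = i;                    /* per-buffer confidentiality IV */
                len[i] = BUF_LEN;
                in[i] = calloc(1, BUF_LEN);
                out[i] = calloc(1, BUF_LEN);
        }

        /* With SAFE_PARAM the call silently returns on NULL pointers or
         * out-of-range lengths, exactly as the checks above show. */
        kasumi_f8_n_buffer_sse(&sched, iv, (const void * const *)in,
                               out, len, NUM_BUFS);

        for (i = 0; i < NUM_BUFS; i++) {
                free(in[i]);
                free(out[i]);
        }
        return 0;
}

The same calling pattern applies to the single-, 2-, 3- and 4-buffer F8 wrappers and to the F9 authentication wrappers above; only the argument lists differ.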
+;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_sse +%include "sse/mb_mgr_aes_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm new file mode 100644 index 000000000..2c8afece9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_sse +%include "sse/mb_mgr_aes_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm new file mode 100644 index 000000000..55f7767f4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_sse +%include "sse/mb_mgr_aes_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm new file mode 100644 index 000000000..7aca39f25 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_ccm_auth_submit_flush_sse.asm @@ -0,0 +1,518 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
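The submit path of the CCM manager below starts by assembling the CBC-MAC input in the lane's scratch area: block B_0 with the flags byte, nonce and big-endian message length, followed, when AAD is present, by a 2-byte AAD length and the zero-padded AAD. A C sketch of that layout (RFC 3610 / SP 800-38C), assuming a 7-13 byte nonce, an even tag length between 4 and 16 bytes, and AAD shorter than 2^16 - 2^8 bytes; the helper names are made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * B_0 layout, as constructed by the submit code below:
 *   byte 0       flags = Adata<<6 | M'<<3 | L'
 *                with M' = (tag_len - 2) / 2 and L' = 15 - nonce_len - 1
 *   bytes 1..13  nonce (7 to 13 bytes)
 *   bytes 14..15 message length, big endian (the code stores 2 bytes;
 *                the remaining length bytes stay zero)
 */
static void ccm_block0(uint8_t b0[16], const uint8_t *nonce, size_t nonce_len,
                       size_t tag_len, uint16_t msg_len, int have_aad)
{
        uint8_t flags = (uint8_t)(14 - nonce_len);           /* L' */

        flags |= (uint8_t)(((tag_len - 2) / 2) << 3);        /* M' */
        if (have_aad)
                flags |= 1u << 6;                            /* Adata bit */

        memset(b0, 0, 16);
        b0[0] = flags;
        memcpy(&b0[1], nonce, nonce_len);
        b0[14] = (uint8_t)(msg_len >> 8);
        b0[15] = (uint8_t)(msg_len & 0xff);
}

/*
 * AAD blocks: 2-byte big-endian AAD length, the AAD itself, then zero
 * padding up to a 16-byte boundary - the same (aad_len + 2 + 15) & ~15
 * total that the assembly computes for the initial authentication length.
 */
static size_t ccm_aad_blocks(uint8_t *dst, const uint8_t *aad, uint16_t aad_len)
{
        size_t total = ((size_t)aad_len + 2 + 15) & ~(size_t)15;

        memset(dst, 0, total);
        dst[0] = (uint8_t)(aad_len >> 8);
        dst[1] = (uint8_t)(aad_len & 0xff);
        memcpy(&dst[2], aad, aad_len);
        return total;
}

int main(void)
{
        const uint8_t nonce[12] = {0};      /* 12-byte nonce, demo values */
        const uint8_t aad[8] = {0};
        uint8_t b0[16], aad_blk[32];

        ccm_block0(b0, nonce, sizeof(nonce), 8 /* tag */, 48 /* msg */, 1);
        printf("flags byte: 0x%02x, AAD bytes: %zu\n",
               b0[0], ccm_aad_blocks(aad_blk, aad, sizeof(aad)));
        return 0;
}

For this example the flags byte comes out as 0x5A (Adata set, M' = 3, L' = 2) and the padded AAD region is 16 bytes, which matches the arithmetic in the assembly below.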
+;; + + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" +%include "include/const.inc" +%include "include/memcpy.asm" + +%ifndef AES128_CBC_MAC + +%define AES128_CBC_MAC aes128_cbc_mac_x4 +%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_sse +%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_sse + +%endif + +extern AES128_CBC_MAC + +section .data +default rel + +align 16 +len_masks: + dq 0x000000000000FFFF, 0x0000000000000000 + dq 0x00000000FFFF0000, 0x0000000000000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +counter_mask: + dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%define NROUNDS 9 ; AES-CCM-128 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax +%define tmp4 rax +%define auth_len_aad rax + +%define min_idx rbp +%define flags rbp + +%define lane r8 + +%define iv_len r9 +%define auth_len r9 + +%define aad_len r10 +%define init_block_addr r11 + +%define unused_lanes rbx +%define r rbx + +%define tmp r12 +%define tmp2 r13 +%define tmp3 r14 + +%define good_lane r15 +%define min_job r15 + +%define init_block0 xmm0 +%define ccm_lens xmm1 +%define min_len_idx xmm2 +%define xtmp0 xmm3 +%define xtmp1 xmm4 +%define xtmp2 xmm5 +%define xtmp3 xmm6 + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +;;; =========================================================================== +;;; =========================================================================== +;;; MACROS +;;; =========================================================================== +;;; =========================================================================== + +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + pxor %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + aesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep + aesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + +;;; =========================================================================== +;;; AES CCM auth job submit & flush +;;; =========================================================================== +;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection +%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE 1 +%define %%SUBMIT_FLUSH %1 + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ;; Find free lane + mov unused_lanes, [state + _aes_ccm_unused_lanes] + +%ifidn %%SUBMIT_FLUSH, SUBMIT + + mov lane, unused_lanes + and lane, 15 + shr unused_lanes, 4 + mov [state + _aes_ccm_unused_lanes], unused_lanes + + ;; Copy job info into lane + mov [state + _aes_ccm_job_in_lane + lane*8], job + ;; Copy keys into lane args + mov tmp, [job + _aes_enc_key_expanded] + mov [state + _aes_ccm_args_keys + lane*8], tmp + ;; init_done = 0 + mov word [state + _aes_ccm_init_done + lane*2], 0 + lea tmp, [lane * 8] + + pxor init_block0, init_block0 + movdqa [state + 
_aes_ccm_args_IV + tmp*2], init_block0 + + ;; Prepare initial Block 0 for CBC-MAC-128 + + ;; Byte 0: flags with L' and M' (AAD later) + ;; Calculate L' = 15 - IV length - 1 = 14 - IV length + mov flags, 14 + mov iv_len, [job + _iv_len_in_bytes] + sub flags, iv_len + ;; Calculate M' = (Digest length - 2) / 2 + mov tmp, [job + _auth_tag_output_len_in_bytes] + sub tmp, 2 + + shl tmp, 2 ; M' << 3 (combine 1xshr, to div by 2, and 3xshl) + or flags, tmp + + ;; Bytes 1 - 13: Nonce (7 - 13 bytes long) + + ;; Bytes 1 - 7 are always copied (first 7 bytes) + mov tmp, [job + _iv] + pinsrb init_block0, [tmp], 1 + pinsrw init_block0, [tmp + 1], 1 + pinsrd init_block0, [tmp + 3], 1 + + cmp iv_len, 7 + je %%_finish_nonce_move + + cmp iv_len, 8 + je %%_iv_length_8 + cmp iv_len, 9 + je %%_iv_length_9 + cmp iv_len, 10 + je %%_iv_length_10 + cmp iv_len, 11 + je %%_iv_length_11 + cmp iv_len, 12 + je %%_iv_length_12 + + ;; Bytes 8 - 13 +%%_iv_length_13: + pinsrb init_block0, [tmp + 12], 13 +%%_iv_length_12: + pinsrb init_block0, [tmp + 11], 12 +%%_iv_length_11: + pinsrd init_block0, [tmp + 7], 2 + jmp %%_finish_nonce_move +%%_iv_length_10: + pinsrb init_block0, [tmp + 9], 10 +%%_iv_length_9: + pinsrb init_block0, [tmp + 8], 9 +%%_iv_length_8: + pinsrb init_block0, [tmp + 7], 8 + +%%_finish_nonce_move: + + ;; Bytes 14 & 15 (message length), in Big Endian + mov ax, [job + _msg_len_to_hash_in_bytes] + xchg al, ah + pinsrw init_block0, ax, 7 + + mov aad_len, [job + _cbcmac_aad_len] + ;; Initial length to authenticate (Block 0) + mov auth_len, 16 + ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD padded, + ;; so length is multiple of 64B) + lea auth_len_aad, [aad_len + (2 + 15) + 16] + and auth_len_aad, -16 + + or aad_len, aad_len + cmovne auth_len, auth_len_aad + ;; Update lengths to authenticate and find min length + movdqa ccm_lens, [state + _aes_ccm_lens] + XPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16 + movdqa [state + _aes_ccm_lens], ccm_lens + phminposuw min_len_idx, ccm_lens + + mov tmp, lane + shl tmp, 6 + lea init_block_addr, [state + _aes_ccm_init_blocks + tmp] + or aad_len, aad_len + je %%_aad_complete + + or flags, (1 << 6) ; Set Adata bit in flags + + ;; Copy AAD + ;; Set all 0s in last block (padding) + lea tmp, [init_block_addr + auth_len] + sub tmp, 16 + pxor xtmp0, xtmp0 + movdqa [tmp], xtmp0 + + ;; Start copying from second block + lea tmp, [init_block_addr+16] + mov rax, aad_len + xchg al, ah + mov [tmp], ax + add tmp, 2 + mov tmp2, [job + _cbcmac_aad] + memcpy_sse_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3 + +%%_aad_complete: + + ;; Finish Block 0 with Byte 0 + pinsrb init_block0, BYTE(flags), 0 + movdqa [init_block_addr], init_block0 + + ;; args.in[lane] = &initial_block + mov [state + _aes_ccm_args_in + lane * 8], init_block_addr + + cmp byte [state + _aes_ccm_unused_lanes], 0xf + jne %%_return_null + +%else ; end SUBMIT + + ;; Check at least one job + bt unused_lanes, 19 + jc %%_return_null + + ;; Find a lane with a non-null job + xor good_lane, good_lane + cmp qword [state + _aes_ccm_job_in_lane + 1*8], 0 + cmovne good_lane, [rel one] + cmp qword [state + _aes_ccm_job_in_lane + 2*8], 0 + cmovne good_lane, [rel two] + cmp qword [state + _aes_ccm_job_in_lane + 3*8], 0 + cmovne good_lane, [rel three] + + ; Copy good_lane to empty lanes + movzx tmp, word [state + _aes_ccm_init_done + good_lane*2] + mov tmp2, [state + _aes_ccm_args_in + good_lane*8] + mov tmp3, [state + _aes_ccm_args_keys + good_lane*8] + shl good_lane, 4 ; multiply by 16 + movdqa xtmp0, 
[state + _aes_ccm_args_IV + good_lane] + movdqa ccm_lens, [state + _aes_ccm_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_ccm_job_in_lane + I*8], 0 + jne APPEND(skip_,I) + por ccm_lens, [rel len_masks + 16*I] + mov [state + _aes_ccm_init_done + I*2], WORD(tmp) + mov [state + _aes_ccm_args_in + I*8], tmp2 + mov [state + _aes_ccm_args_keys + I*8], tmp3 + movdqa [state + _aes_ccm_args_IV + I*16], xtmp0 +APPEND(skip_,I): +%assign I (I+1) +%endrep + movdqa [state + _aes_ccm_lens], ccm_lens + ;; Find min length + phminposuw min_len_idx, ccm_lens + +%endif ; end FLUSH + +%%_ccm_round: + pextrw len2, min_len_idx, 0 ; min value + pextrw min_idx, min_len_idx, 1 ; min index (0...3) + + mov min_job, [state + _aes_ccm_job_in_lane + min_idx*8] + + or len2, len2 + je %%_len_is_0 + ;; subtract min length from all lengths + pshuflw min_len_idx, min_len_idx, 0 ; broadcast min length + psubw ccm_lens, min_len_idx + movdqa [state + _aes_ccm_lens], ccm_lens + + ; "state" and "args" are the same address, arg1 + ; len2 is arg2 + call AES128_CBC_MAC + ; state and min_idx are intact + +%%_len_is_0: + + movzx tmp, WORD [state + _aes_ccm_init_done + min_idx*2] + cmp WORD(tmp), 0 + je %%_prepare_full_blocks_to_auth + cmp WORD(tmp), 1 + je %%_prepare_partial_block_to_auth + +%%_encrypt_digest: + + ;; Set counter block 0 (reusing previous initial block 0) + mov tmp, min_idx + shl tmp, 3 + movdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8] + + pand init_block0, [rel counter_mask] + + mov tmp2, [state + _aes_ccm_args_keys + tmp] + ENCRYPT_SINGLE_BLOCK tmp2, init_block0 + pxor init_block0, [state + _aes_ccm_args_IV + tmp * 2] + + ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16) + mov min_job, [state + _aes_ccm_job_in_lane + tmp] + mov tmp3, [min_job + _auth_tag_output_len_in_bytes] + mov tmp2, [min_job + _auth_tag_output] + + simd_store_sse tmp2, init_block0, tmp3, tmp, rax + +%%_update_lanes: + ; Update unused lanes + mov unused_lanes, [state + _aes_ccm_unused_lanes] + shl unused_lanes, 4 + or unused_lanes, min_idx + mov [state + _aes_ccm_unused_lanes], unused_lanes + + ; Set return job + mov job_rax, min_job + + mov qword [state + _aes_ccm_job_in_lane + min_idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + +%ifdef SAFE_DATA + pxor xtmp0, xtmp0 +%ifidn %%SUBMIT_FLUSH, SUBMIT + shl min_idx, 3 + ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job + movdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0 + movdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0 + movdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0 + movdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0 + movdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0 + mov qword [state + _aes_ccm_args_keys + min_idx], 0 +%else + ;; Clear digest (in memory for CBC IV), counter block 0 and AAD + ;; of returned job and "NULL lanes" +%assign I 0 +%rep 4 + cmp qword [state + _aes_ccm_job_in_lane + I*8], 0 + jne APPEND(skip_clear_,I) + movdqa [state + _aes_ccm_args_IV + I*16], xtmp0 + movdqa [state + _aes_ccm_init_blocks + I*64], xtmp0 + movdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0 + movdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0 + movdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0 + mov qword [state + _aes_ccm_args_keys + I*8], 0 +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SUBMIT +%endif ;; SAFE_DATA + +%%_return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 
8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%%_return_null: + xor job_rax, job_rax + jmp %%_return + +%%_prepare_full_blocks_to_auth: + + cmp dword [min_job + _cipher_direction], 2 ; DECRYPT + je %%_decrypt + +%%_encrypt: + mov tmp, [min_job + _src] + add tmp, [min_job + _hash_start_src_offset_in_bytes] + jmp %%_set_init_done_1 + +%%_decrypt: + mov tmp, [min_job + _dst] + +%%_set_init_done_1: + mov [state + _aes_ccm_args_in + min_idx*8], tmp + mov word [state + _aes_ccm_init_done + min_idx*2], 1 + + ; Check if there are full blocks to hash + mov tmp, [min_job + _msg_len_to_hash_in_bytes] + and tmp, -16 + je %%_prepare_partial_block_to_auth + + ;; Update lengths to authenticate and find min length + movdqa ccm_lens, [state + _aes_ccm_lens] + XPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16 + phminposuw min_len_idx, ccm_lens + movdqa [state + _aes_ccm_lens], ccm_lens + + jmp %%_ccm_round + +%%_prepare_partial_block_to_auth: + ; Check if partial block needs to be hashed + mov auth_len, [min_job + _msg_len_to_hash_in_bytes] + and auth_len, 15 + je %%_encrypt_digest + + mov word [state + _aes_ccm_init_done + min_idx * 2], 2 + ;; Update lengths to authenticate and find min length + movdqa ccm_lens, [state + _aes_ccm_lens] + XPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16 + phminposuw min_len_idx, ccm_lens + movdqa [state + _aes_ccm_lens], ccm_lens + + mov tmp2, min_idx + shl tmp2, 6 + add tmp2, 16 ; pb[AES_BLOCK_SIZE] + lea init_block_addr, [state + _aes_ccm_init_blocks + tmp2] + mov tmp2, [state + _aes_ccm_args_in + min_idx * 8] + + simd_load_sse_15_1 xtmp0, tmp2, auth_len + +%%_finish_partial_block_copy: + movdqa [init_block_addr], xtmp0 + mov [state + _aes_ccm_args_in + min_idx * 8], init_block_addr + + jmp %%_ccm_round +%endmacro + + +align 64 +; JOB_AES_HMAC * submit_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal) +SUBMIT_JOB_AES_CCM_AUTH: + GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE SUBMIT + +; JOB_AES_HMAC * flush_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state) +; arg 1 : state +MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal) +FLUSH_JOB_AES_CCM_AUTH: + GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE FLUSH + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm new file mode 100644 index 000000000..01c6315bd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm @@ -0,0 +1,502 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +%ifndef AES128_CBC_MAC + +%define AES128_CBC_MAC aes128_cbc_mac_x4 +%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse +%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse + +%endif + +extern AES128_CBC_MAC + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +; idx needs to be in rbp +%define len rbp +%define idx rbp +%define tmp rbp + +%define lane r8 + +%define iv r9 +%define m_last r10 +%define n r11 + +%define unused_lanes rbx +%define r rbx + +%define tmp3 r12 +%define tmp4 r13 +%define tmp2 r14 + +%define good_lane r15 +%define rbits r15 + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +;;; =========================================================================== +;;; =========================================================================== +;;; MACROS +;;; =========================================================================== +;;; =========================================================================== + +;;; =========================================================================== +;;; AES CMAC job submit & flush +;;; =========================================================================== +;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection +%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1 +%define %%SUBMIT_FLUSH %1 + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + 
_gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ;; Find free lane + mov unused_lanes, [state + _aes_cmac_unused_lanes] + +%ifidn %%SUBMIT_FLUSH, SUBMIT + + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + mov [state + _aes_cmac_unused_lanes], unused_lanes + + ;; Copy job info into lane + mov [state + _aes_cmac_job_in_lane + lane*8], job + ;; Copy keys into lane args + mov tmp, [job + _key_expanded] + mov [state + _aes_cmac_args_keys + lane*8], tmp + mov tmp, lane + shl tmp, 4 ; lane*16 + + ;; Zero IV to store digest + pxor xmm0, xmm0 + movdqa [state + _aes_cmac_args_IV + tmp], xmm0 + + lea m_last, [state + _aes_cmac_scratch + tmp] + + ;; calculate len + ;; convert bits to bytes (message length in bits for CMAC) + mov len, [job + _msg_len_to_hash_in_bits] + mov rbits, len + add len, 7 ; inc len if there are remainder bits + shr len, 3 + and rbits, 7 + + ;; Check at least 1 or more blocks (get n) + mov n, len + add n, 0xf + shr n, 4 + + ;; Check for partial block + mov r, len + and r, 0xf + + or n, n ; check one or more blocks? + jz %%_lt_one_block + + ;; One or more blocks, potentially partial + mov word [state + _aes_cmac_init_done + lane*2], 0 + + mov tmp2, [job + _src] + add tmp2, [job + _hash_start_src_offset_in_bytes] + mov [state + _aes_cmac_args_in + lane*8], tmp2 + + ;; len = (n-1)*16 + lea tmp2, [n - 1] + shl tmp2, 4 + movdqa xmm0, [state + _aes_cmac_lens] + XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16 + movdqa [state + _aes_cmac_lens], xmm0 + + ;; check remainder bits + or rbits, rbits + jnz %%_not_complete_block_3gpp + + ;; check if complete block + or r, r + jz %%_complete_block + +%%_not_complete_block: + ;; M_last = padding(M_n) XOR K2 + lea tmp, [rel padding_0x80_tab16 + 16] + sub tmp, r + movdqu xmm0, [tmp] + movdqa [m_last], xmm0 + + mov tmp, [job + _src] + add tmp, [job + _hash_start_src_offset_in_bytes] + lea tmp3, [n - 1] + shl tmp3, 4 + add tmp, tmp3 + + memcpy_sse_16 m_last, tmp, r, tmp4, iv + + ;; src + n + r + mov tmp3, [job + _skey2] + movdqa xmm1, [m_last] + movdqu xmm0, [tmp3] + pxor xmm0, xmm1 + movdqa [m_last], xmm0 + +%%_step_5: + ;; Find min length + movdqa xmm0, [state + _aes_cmac_lens] + phminposuw xmm1, xmm0 + + cmp byte [state + _aes_cmac_unused_lanes], 0xf + jne %%_return_null + +%else ; end SUBMIT + + ;; Check at least one job + bt unused_lanes, 19 + jc %%_return_null + + ;; Find a lane with a non-null job + xor good_lane, good_lane + cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0 + cmovne good_lane, [rel one] + cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0 + cmovne good_lane, [rel two] + cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0 + cmovne good_lane, [rel three] + + ; Copy good_lane to empty lanes + mov tmp2, [state + _aes_cmac_args_in + good_lane*8] + mov tmp3, [state + _aes_cmac_args_keys + good_lane*8] + shl good_lane, 4 ; multiply by 16 + movdqa xmm2, [state + _aes_cmac_args_IV + good_lane] + movdqa xmm0, [state + _aes_cmac_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_cmac_job_in_lane + I*8], 0 + jne APPEND(skip_,I) + mov [state + _aes_cmac_args_in + I*8], tmp2 + mov [state + _aes_cmac_args_keys + I*8], tmp3 + movdqa [state + _aes_cmac_args_IV + I*16], xmm2 + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + ;; Find min length + phminposuw xmm1, xmm0 + +%endif ; end FLUSH + +%%_cmac_round: + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je %%_len_is_0 + pshuflw xmm1, xmm1, 
0 + psubw xmm0, xmm1 + movdqa [state + _aes_cmac_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len2 is arg2 + call AES128_CBC_MAC + ; state and idx are intact + + movdqa xmm0, [state + _aes_cmac_lens] ; preload lens +%%_len_is_0: + ; Check if job complete + test word [state + _aes_cmac_init_done + idx*2], 0xffff + jnz %%_copy_complete_digest + + ; Finish step 6 + mov word [state + _aes_cmac_init_done + idx*2], 1 + + XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16 + movdqa [state + _aes_cmac_lens], xmm0 + + phminposuw xmm1, xmm0 ; find min length + + mov tmp3, idx + shl tmp3, 4 ; idx*16 + lea m_last, [state + _aes_cmac_scratch + tmp3] + mov [state + _aes_cmac_args_in + idx*8], m_last + + jmp %%_cmac_round + +%%_copy_complete_digest: + ; Job complete, copy digest to AT output + mov job_rax, [state + _aes_cmac_job_in_lane + idx*8] + + mov tmp4, idx + shl tmp4, 4 + lea tmp3, [state + _aes_cmac_args_IV + tmp4] + mov tmp4, [job_rax + _auth_tag_output_len_in_bytes] + mov tmp2, [job_rax + _auth_tag_output] + + cmp tmp4, 16 + jne %%_ne_16_copy + + ;; 16 byte AT copy + movdqu xmm0, [tmp3] + movdqu [tmp2], xmm0 + jmp %%_update_lanes + +%%_ne_16_copy: + memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv + +%%_update_lanes: + ; Update unused lanes + mov unused_lanes, [state + _aes_cmac_unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _aes_cmac_unused_lanes], unused_lanes + + ; Set return job + mov job_rax, [state + _aes_cmac_job_in_lane + idx*8] + + mov qword [state + _aes_cmac_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%ifidn %%SUBMIT_FLUSH, SUBMIT + ;; Clear digest (in memory for IV) and scratch memory of returned job + movdqa [tmp3], xmm0 + + shl idx, 4 + movdqa [state + _aes_cmac_scratch + idx], xmm0 + +%else + ;; Clear digest and scratch memory of returned job and "NULL lanes" +%assign I 0 +%rep 4 + cmp qword [state + _aes_cmac_job_in_lane + I*8], 0 + jne APPEND(skip_clear_,I) + movdqa [state + _aes_cmac_args_IV + I*16], xmm0 + movdqa [state + _aes_cmac_scratch + I*16], xmm0 +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep +%endif ;; SUBMIT + +%endif ;; SAFE_DATA + +%%_return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%%_return_null: + xor job_rax, job_rax + jmp %%_return + +%ifidn %%SUBMIT_FLUSH, SUBMIT +%%_complete_block: + + ;; Block size aligned + mov tmp2, [job + _src] + add tmp2, [job + _hash_start_src_offset_in_bytes] + lea tmp3, [n - 1] + shl tmp3, 4 + add tmp2, tmp3 + + ;; M_last = M_n XOR K1 + mov tmp3, [job + _skey1] + movdqu xmm0, [tmp3] + movdqu xmm1, [tmp2] + pxor xmm0, xmm1 + movdqa [m_last], xmm0 + + jmp %%_step_5 + +%%_lt_one_block: + ;; Single partial block + mov word [state + _aes_cmac_init_done + lane*2], 1 + mov [state + _aes_cmac_args_in + lane*8], m_last + + movdqa xmm0, [state + _aes_cmac_lens] + XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16 + movdqa [state + _aes_cmac_lens], xmm0 + + mov n, 1 + jmp %%_not_complete_block + +%%_not_complete_block_3gpp: + ;; bit pad last block + ;; xor with skey2 + ;; copy to m_last + + ;; load pointer to src + mov tmp, [job + _src] + add tmp, [job + _hash_start_src_offset_in_bytes] + lea tmp3, [n - 1] + shl tmp3, 4 + add tmp, tmp3 
+ + ;; check if partial block + or r, r + jz %%_load_full_block_3gpp + + simd_load_sse_15_1 xmm0, tmp, r + dec r + +%%_update_mlast_3gpp: + ;; set last byte padding mask + ;; shift into correct xmm idx + + ;; save and restore rcx on windows +%ifndef LINUX + mov tmp, rcx +%endif + mov rcx, rbits + mov tmp3, 0xff + shr tmp3, cl + movq xmm2, tmp3 + XPSLLB xmm2, r, xmm1, tmp2 + + ;; pad final byte + pandn xmm2, xmm0 +%ifndef LINUX + mov rcx, tmp +%endif + ;; set OR mask to pad final bit + mov tmp2, tmp3 + shr tmp2, 1 + xor tmp2, tmp3 ; XOR to get OR mask + movq xmm3, tmp2 + ;; xmm1 contains shift table from previous shift + pshufb xmm3, xmm1 + + ;; load skey2 address + mov tmp3, [job + _skey2] + movdqu xmm1, [tmp3] + + ;; set final padding bit + por xmm2, xmm3 + + ;; XOR last partial block with skey2 + ;; update mlast + pxor xmm2, xmm1 + movdqa [m_last], xmm2 + + jmp %%_step_5 + +%%_load_full_block_3gpp: + movdqu xmm0, [tmp] + mov r, 0xf + jmp %%_update_mlast_3gpp +%endif +%endmacro + + +align 64 +; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal) +SUBMIT_JOB_AES_CMAC_AUTH: + GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT + +; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state) +; arg 1 : state +MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal) +FLUSH_JOB_AES_CMAC_AUTH: + GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm new file mode 100644 index 000000000..0066aff9f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm @@ -0,0 +1,217 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
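In the CMAC manager above, the last block is prepared in the lane's scratch area before the CBC-MAC kernel runs: a complete final block is XORed with subkey K1 (the job's _skey1), a partial one is padded with 0x80 00..00 and XORed with K2 (_skey2), and the 3GPP branch does the same at bit granularity, placing the padding bit at the exact bit offset. A C sketch of the byte-oriented case (RFC 4493, step 4); subkey derivation and the AES calls themselves are out of scope, and the function name is made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Build M_last as the submit path above does: full block -> XOR K1,
 * partial block -> 10..0 padding then XOR K2. */
static void cmac_m_last(uint8_t m_last[16],
                        const uint8_t *last_block, size_t last_len,
                        const uint8_t k1[16], const uint8_t k2[16])
{
        uint8_t padded[16];
        const uint8_t *subkey;
        size_t i;

        if (last_len == 16) {
                memcpy(padded, last_block, 16);
                subkey = k1;
        } else {
                memset(padded, 0, 16);
                memcpy(padded, last_block, last_len);
                padded[last_len] = 0x80;     /* single 1 bit, then zeros */
                subkey = k2;
        }
        for (i = 0; i < 16; i++)
                m_last[i] = padded[i] ^ subkey[i];
}

int main(void)
{
        const uint8_t k1[16] = {0}, k2[16] = {0};   /* demo subkeys */
        const uint8_t msg[5] = {1, 2, 3, 4, 5};
        uint8_t m_last[16];

        cmac_m_last(m_last, msg, sizeof(msg), k1, k2);
        printf("padding byte at offset 5: 0x%02x\n", m_last[5]); /* 0x80 */
        return 0;
}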
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" + +%ifndef AES_CBC_ENC_X4 +%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_sse +%endif + +; void AES_CBC_ENC_X4(AES_ARGS *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X4 + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define good_lane rdx +%define iv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp rbp +%define idx rbp + +%define tmp3 r8 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal) +FLUSH_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor good_lane, good_lane + cmp qword [state + _aes_job_in_lane + 1*8], 0 + cmovne good_lane, [rel one] + cmp qword [state + _aes_job_in_lane + 2*8], 0 + cmovne good_lane, [rel two] + cmp qword [state + _aes_job_in_lane + 3*8], 0 + cmovne good_lane, [rel three] + + ; copy good_lane to empty lanes + mov tmp1, [state + _aes_args_in + good_lane*8] + mov tmp2, [state + _aes_args_out + good_lane*8] + mov tmp3, [state + _aes_args_keys + good_lane*8] + shl good_lane, 4 ; multiply by 16 + movdqa xmm2, [state + _aes_args_IV + good_lane] + movdqa xmm0, [state + _aes_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_job_in_lane + I*8], 0 + jne APPEND(skip_,I) + mov [state + _aes_args_in + I*8], tmp1 + mov [state + _aes_args_out + I*8], tmp2 + mov [state + _aes_args_keys + I*8], tmp3 + movdqa [state + _aes_args_IV + I*16], xmm2 + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + 
or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _aes_unused_lanes], unused_lanes +%ifdef SAFE_DATA + ;; Clear IVs of returned job and "NULL lanes" + pxor xmm0, xmm0 +%assign I 0 +%rep 4 + cmp qword [state + _aes_job_in_lane + I*8], 0 + jne APPEND(skip_clear_,I) + movdqa [state + _aes_args_IV + I*16], xmm0 +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm new file mode 100644 index 000000000..702fb91a4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm @@ -0,0 +1,187 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
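flush_job_aes128_enc_sse above handles a partially filled manager by lending a busy lane's pointers and IV to every idle lane and forcing those lanes' 16-bit lengths to 0xFFFF (the len_masks constants), so phminposuw always selects a real job as the minimum; the kernel then runs all four lanes for that minimum length, the lengths are reduced with psubw, and the lane that reached zero is completed. A scalar C sketch of one such round, assuming four lanes and using illustrative structure and helper names (job completion and SAFE_DATA clearing are simplified away):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NUM_LANES 4

/* Illustrative stand-in for MB_MGR_AES_OOO: only the fields the flush
 * path touches. */
struct aes_ooo {
        void *job_in_lane[NUM_LANES];       /* NULL => idle lane */
        const void *args_in[NUM_LANES];
        void *args_out[NUM_LANES];
        const void *args_keys[NUM_LANES];
        uint8_t args_iv[NUM_LANES][16];
        uint16_t lens[NUM_LANES];
};

/* Stand-in for aes_cbc_enc_128_x4(): advances every lane by 'len' bytes. */
static void aes_cbc_enc_x4(struct aes_ooo *s, uint16_t len)
{
        (void)s; (void)len;                 /* SIMD kernel elided */
}

/* One flush round: fill idle lanes, find the minimum length, run, complete. */
static void *flush_round(struct aes_ooo *s)
{
        unsigned good = 0, min_idx = 0, i;
        uint16_t min_len;

        for (i = 1; i < NUM_LANES; i++)     /* the cmovne chain over lanes 1..3 */
                if (s->job_in_lane[i] != NULL)
                        good = i;

        for (i = 0; i < NUM_LANES; i++) {
                if (s->job_in_lane[i] != NULL)
                        continue;
                s->args_in[i]   = s->args_in[good];   /* borrow valid pointers */
                s->args_out[i]  = s->args_out[good];
                s->args_keys[i] = s->args_keys[good];
                memcpy(s->args_iv[i], s->args_iv[good], 16);
                s->lens[i] = 0xFFFF;                  /* por with len_masks[i] */
        }

        min_len = s->lens[0];               /* scalar phminposuw */
        for (i = 1; i < NUM_LANES; i++)
                if (s->lens[i] < min_len) {
                        min_len = s->lens[i];
                        min_idx = i;
                }

        if (min_len != 0) {
                aes_cbc_enc_x4(s, min_len);
                for (i = 0; i < NUM_LANES; i++)
                        s->lens[i] -= min_len;        /* psubw broadcast */
        }

        return s->job_in_lane[min_idx];     /* this job is now complete */
}

int main(void)
{
        struct aes_ooo s;
        int job = 42;

        memset(&s, 0, sizeof(s));
        s.job_in_lane[2] = &job;            /* one busy lane, three idle */
        s.lens[2] = 64;
        printf("completed job in lane 2: %d\n", *(int *)flush_round(&s));
        return 0;
}

The CCM and CMAC flush paths earlier in this patch follow the same scheme; they only add per-lane state (init_done) because a single job may need several passes through the kernel.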
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" +%include "include/const.inc" + +%ifndef AES_CBC_ENC_X4 + +%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_sse + +%endif + +; void AES_CBC_ENC_X4(AES_ARGS *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X4 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define len rbp +%define idx rbp +%define tmp rbp + +%define lane r8 + +%define iv r9 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +section .text + +; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal) +SUBMIT_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + mov iv, [job + _iv] + mov [state + _aes_unused_lanes], unused_lanes + + mov [state + _aes_job_in_lane + lane*8], job + mov tmp, [job + _src] + add tmp, [job + _cipher_start_src_offset_in_bytes] + movdqu xmm0, [iv] + mov [state + _aes_args_in + lane*8], tmp + mov tmp, [job + _aes_enc_key_expanded] + mov [state + _aes_args_keys + lane*8], tmp + mov tmp, [job + _dst] + mov [state + _aes_args_out + lane*8], tmp + shl lane, 4 ; multiply by 16 + movdqa [state + _aes_args_IV + lane], xmm0 + + ;; insert len into proper lane + mov len, [job + _msg_len_to_cipher_in_bytes] + and len, -16 ; DOCSIS may pass size unaligned to block size + + movdqa xmm0, [state + _aes_lens] + XPINSRW xmm0, xmm1, tmp, lane, len, no_scale + movdqa [state + _aes_lens], xmm0 + + cmp unused_lanes, 0xff + jne return_null + + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _aes_unused_lanes], unused_lanes +%ifdef SAFE_DATA + ;; Clear IV + pxor xmm0, xmm0 + shl idx, 3 ; multiply by 8 + movdqa [state + _aes_args_IV + idx*2], xmm0 + mov qword [state + _aes_args_keys + idx], 0 +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] 
+%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm new file mode 100644 index 000000000..6069ce17a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm @@ -0,0 +1,242 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
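submit_job_aes128_enc_sse above pops a free lane index from the byte-packed _aes_unused_lanes list (a 0xFF sentinel sits above the free lane bytes, which is what the flush-side bt check and the "cmp unused_lanes, 0xff" test imply), records the job's pointers and IV in that lane, pins the cipher length rounded down to a whole 16-byte block (the DOCSIS note in the code), and only runs the x4 kernel once all four lanes are occupied; until then it returns NULL so jobs keep accumulating. A scalar C sketch of that decision, with illustrative names and the kernel call elided:

#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 4

/* Illustrative stand-in for the submit-side manager state. */
struct aes_ooo {
        uint64_t unused_lanes;        /* free lane indices, one byte each,
                                         below a 0xFF sentinel byte */
        void *job_in_lane[NUM_LANES];
        uint16_t lens[NUM_LANES];
};

/* Mirrors the submit path above: grab a lane, pin the length, and only
 * run (and return) a job once the manager is full. */
static void *submit_job(struct aes_ooo *s, void *job, uint64_t cipher_len)
{
        unsigned lane = (unsigned)(s->unused_lanes & 0xFF); /* movzx BYTE(...) */

        s->unused_lanes >>= 8;
        s->job_in_lane[lane] = job;
        /* DOCSIS may pass a length that is not a multiple of the block size;
         * the CBC part is rounded down just like "and len, -16" above. */
        s->lens[lane] = (uint16_t)(cipher_len & ~(uint64_t)15);

        if (s->unused_lanes != 0xFF)  /* free lanes remain: keep batching */
                return NULL;

        /* All four lanes busy: the real code now finds the minimum length,
         * calls aes_cbc_enc_128_x4 and returns the finished job (see the
         * flush sketch earlier). Elided in this sketch. */
        return NULL;
}

int main(void)
{
        struct aes_ooo s = { .unused_lanes = 0xFF03020100ULL };
        int jobs[4];

        for (int i = 0; i < 4; i++)
                if (submit_job(&s, &jobs[i], 100 + i) == NULL)
                        printf("job %d placed in a lane\n", i);
        return 0;
}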
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" + +%ifndef AES_XCBC_X4 +%define AES_XCBC_X4 aes_xcbc_mac_128_x4 +%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse +%endif + +; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X4 + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define icv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp r10 +%define idx rbp + +%define tmp3 r8 +%define lane_data r9 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal) +FLUSH_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy idx to empty lanes + mov tmp1, [state + _aes_xcbc_args_in + idx*8] + mov tmp3, [state + _aes_xcbc_args_keys + idx*8] + shl idx, 4 ; multiply by 16 + movdqa xmm2, [state + _aes_xcbc_args_ICV + idx] + movdqa xmm0, [state + _aes_xcbc_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _aes_xcbc_args_in + I*8], tmp1 + mov [state + _aes_xcbc_args_keys + I*8], tmp3 + movdqa [state + _aes_xcbc_args_ICV + I*16], xmm2 + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _aes_xcbc_lens], xmm0 + + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, 
_XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + jmp copy_lane_data + +end_loop: + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + movq [icv], xmm0 + pextrd [icv + 8], xmm0, 2 + + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear ICV's and final blocks in returned job and NULL lanes +%assign I 0 +%rep 4 + cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + jne APPEND(skip_clear_,I) + movdqa [state + _aes_xcbc_args_ICV + I*16], xmm0 + lea lane_data, [state + _aes_xcbc_ldata + (I * _XCBC_LANE_DATA_size)] + movdqa [lane_data + _xcbc_final_block], xmm0 + movdqa [lane_data + _xcbc_final_block + 16], xmm0 +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep +%endif +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm new file mode 100644 index 000000000..e61cc07b1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm @@ -0,0 +1,263 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/const.inc" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%ifndef AES_XCBC_X4 +%define AES_XCBC_X4 aes_xcbc_mac_128_x4 +%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse +%endif + +; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X4 + +section .data +default rel + +align 16 +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define idx rbp +%define last_len rbp + +%define lane r8 + +%define icv r9 +%define p2 r9 + +%define tmp r10 +%define len r11 +%define lane_data r12 +%define p r13 +%define tmp2 r14 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal) +SUBMIT_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + mov [state + _aes_xcbc_unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov [lane_data + _xcbc_job_in_lane], job + mov dword [lane_data + _xcbc_final_done], 0 + mov tmp, [job + _k1_expanded] + mov [state + _aes_xcbc_args_keys + lane*8], tmp + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + + mov last_len, len + + cmp len, 16 + jle small_buffer + + mov [state + _aes_xcbc_args_in + lane*8], p + add p, len ; set point to end of data + + and last_len, 15 ; Check lsbs of msg len + jnz slow_copy ; if not 16B mult, do slow copy + +fast_copy: + movdqu xmm0, [p - 16] ; load last block M[n] + mov tmp, [job + _k2] ; load K2 address + movdqu xmm1, [tmp] ; load K2 + pxor xmm0, xmm1 ; M[n] XOR K2 + movdqa [lane_data + _xcbc_final_block], xmm0 + sub len, 16 ; take last block off length +end_fast_copy: + pxor xmm0, xmm0 + shl lane, 4 ; multiply by 16 + movdqa [state + _aes_xcbc_args_ICV + lane], xmm0 + + ;; insert len into proper lane + movdqa xmm0, [state + _aes_xcbc_lens] + XPINSRW xmm0, xmm1, tmp, lane, len, no_scale + 
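;; Editor's note (not part of the upstream file): a brief sketch of the multi-buffer
;; scheduling used just below, assuming the lens vector keeps one 16-bit byte count
;; per lane, pads unused word slots with 0xFFFF, and that 0xff marks an empty
;; free-lane list. The submit path only starts hashing once no free lanes remain
;; (the "cmp unused_lanes, 0xff / jne return_null" check), then runs all four lanes
;; for the length of the shortest one:
;;   phminposuw xmm1, xmm0     ; word 0 <- smallest length, word 1 <- its lane index
;;   pextrw     len2, xmm1, 0  ; bytes every lane can safely process in this call
;;   pextrw     idx,  xmm1, 1  ; lane that will finish first
;;   pshuflw    xmm1, xmm1, 0  ; broadcast the minimum across the low four words
;;   psubw      xmm0, xmm1     ; remaining per-lane lengths after AES_XCBC_X4 runs
;; Every lane advances by the same (minimum) amount, so no lane overruns its data.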
movdqa [state + _aes_xcbc_lens], xmm0 + + cmp unused_lanes, 0xff + jne return_null + +start_loop: + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + movdqa xmm0, [state + _aes_xcbc_lens] + jmp start_loop + +end_loop: + ; process completed job "idx" + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + movq [icv], xmm0 + pextrd [icv + 8], xmm0, 2 + +%ifdef SAFE_DATA + ;; Clear ICV + pxor xmm0, xmm0 + movdqa [state + _aes_xcbc_args_ICV + idx], xmm0 + + ;; Clear final block (32 bytes) + movdqa [lane_data + _xcbc_final_block], xmm0 + movdqa [lane_data + _xcbc_final_block + 16], xmm0 +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +small_buffer: + ; For buffers <= 16 Bytes + ; The input data is set to final block + lea tmp, [lane_data + _xcbc_final_block] ; final block + mov [state + _aes_xcbc_args_in + lane*8], tmp + add p, len ; set point to end of data + cmp len, 16 + je fast_copy + +slow_copy: + and len, ~15 ; take final block off len + sub p, last_len ; adjust data pointer + lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final + sub p2, last_len ; adjust data pointer backwards + memcpy_sse_16_1 p2, p, last_len, tmp, tmp2 + movdqa xmm0, [rel x80] ; fill reg with padding + movdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding + movdqu xmm0, [p2] ; load final block to process + mov tmp, [job + _k3] ; load K3 address + movdqu xmm1, [tmp] ; load K3 + pxor xmm0, xmm1 ; M[n] XOR K3 + movdqu [lane_data + _xcbc_final_block], xmm0 ; write final block + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm new file mode 100644 index 000000000..ac1bb8691 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm @@ -0,0 +1,305 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of 
source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX RCX RDX R8 +;; Windows preserves: RBX RBP RSI RDI R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX RSI RDI R8 +;; Linux preserves: RBX RCX RDX RBP R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha1_ni + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +one: + dq 1 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define p2 r8 + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state) +; arg 1 : state +MKGLOBAL(flush_job_hmac_ni_sse,function,internal) +flush_job_hmac_ni_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-ni-sse flush" + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null 
job, assume it is 0 then check 1 + xor idx, idx + cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + DBGPRINTL64 "idx:", idx + +copy_lane_data: + ; copy valid lane (idx) to empty lanes + mov tmp, [state + _args_data_ptr + PTR_SZ*idx] + movzx len2, word [state + _lens + idx*2] + + DBGPRINTL64 "ptr", tmp + + ; there are only two lanes so if one is empty it is easy to determine which one + xor idx, 1 + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + xor idx, 1 + + ; No need to find min length - only two lanes available + cmp len2, 0 + je len_is_0 + + ; Set length on both lanes to 0 + mov dword [state + _lens], 0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_ni + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + DBGPRINTL64 "outer-block-index", idx + lea tmp, [lane_data + _outer_block] + DBGPRINTL64 "outer block ptr:", tmp + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + ;; idx determines which column + ;; read off from consecutive rows +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea p2, [idx + idx*4] + movdqu xmm0, [state + _args_digest + p2*4] + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 + DBGPRINTL64 "sha1 outer hash input word 4", tmp + mov job, [lane_data + _job_in_lane] + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] + movdqu [state + _args_digest + p2*4], xmm0 + mov [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + DBGPRINTL64 "extra blocks-start offset", start_offset + mov [state + _lens + 2*idx], WORD(extra_blocks) + DBGPRINTL64 "extra blocks-len", extra_blocks + lea tmp, [lane_data + _extra_block + start_offset] + DBGPRINTL64 "extra block ptr", tmp + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" 
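;; Editor's note (not part of the upstream file): the size guard above exists
;; because the tag-copy code after the %endif hard-codes 20-byte digest rows.
;; Sketch of that addressing, using only the registers already in scope:
;;   lea idx, [idx + idx*4]                          ; idx * 5
;;   ... [state + _args_digest + idx*4 + N*SHA1_DIGEST_WORD_SIZE]
;;                                                   ; idx*5*4 = idx*20, word N
;; Each 32-bit digest word is then bswap'd before being stored to the auth tag.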
+%endif + lea idx, [idx + idx*4] + mov DWORD(tmp2), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp4), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov DWORD(tmp2), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp2) + mov [p + 2*4], DWORD(tmp2) + + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ;; copy remaining 8 bytes to return 20 byte digest + mov DWORD(tmp2), [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp4), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + mov [p + 3*4], DWORD(tmp2) + mov [p + 4*4], DWORD(tmp4) + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest (20B), outer_block (20B) and extra_block (64B) + ;; of returned job and NULL jobs +%assign I 0 +%rep 2 + cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest + movdqu [state + _args_digest + I*20], xmm0 + mov dword [state + _args_digest + I*20 + 16], 0 + + lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 20 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 + mov dword [lane_data + _outer_block + 16], 0 + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm new file mode 100644 index 000000000..0f760b01c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm @@ -0,0 +1,302 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha1_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 +x00: ;ddq 0x00000000000000000000000000000000 + dq 0x0000000000000000, 0x0000000000000000 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state) +; arg 1 : state +MKGLOBAL(flush_job_hmac_sse,function,internal) +flush_job_hmac_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-sse flush" + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] +copy_lane_data: + ; copy valid lane (idx) to empty lanes + movdqa xmm0, [state + _lens] + mov tmp, [state + _args_data_ptr + PTR_SZ*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr + PTR_SZ*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens], xmm0 + + ; "state" and 
"args" are the same address, arg1 + ; len is arg2 + call sha1_mult_sse + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + ;; idx determines which column + ;; read off from consecutive rows + movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*4], DWORD(tmp) + DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 + DBGPRINTL64 "sha1 outer hash input word 4", tmp + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + mov [p + 2*4], DWORD(tmp2) + + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ;; copy remaining 8 bytes to return 20 byte digest + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap 
DWORD(tmp2) + bswap DWORD(tmp4) + mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp4) + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest (20B), outer_block (20B) and extra_block (64B) + ;; of returned job and NULL jobs +%assign I 0 +%rep 4 + cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0 + + lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 20 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 + mov dword [lane_data + _outer_block + 16], 0 + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm new file mode 100644 index 000000000..d23f37976 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm @@ -0,0 +1,318 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +extern md5_x4x2_sse + +section .data +default rel +align 16 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 + ;ddq 0x000000000000FFFF0000000000000000 + dq 0x0000000000000000, 0x000000000000FFFF + ;ddq 0x00000000FFFF00000000000000000000 + dq 0x0000000000000000, 0x00000000FFFF0000 + ;ddq 0x0000FFFF000000000000000000000000 + dq 0x0000000000000000, 0x0000FFFF00000000 + ;ddq 0xFFFF0000000000000000000000000000 + dq 0x0000000000000000, 0xFFFF000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define idx rbp + +; unused_lanes must be in rax-rdx +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define tmp5 r9 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(flush_job_hmac_md5_sse,function,internal) +flush_job_hmac_md5_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel one] + cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel two] + cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel three] + cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel four] + cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel five] + cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel six] + cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel seven] + +copy_lane_data: + ; copy good lane (idx) to empty lanes + movdqa xmm0, [state + _lens_md5] + mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_md5 + PTR_SZ*I], 
tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_md5], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshufb xmm1, [rel dupw] ; duplicate words across all lanes + psubw xmm0, xmm1 + movdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_md5 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 +; pshufb xmm0, [byteswap wrt rip] + movdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_md5] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] +; bswap DWORD(tmp2) +; bswap DWORD(tmp4) +; bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp5) + + cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ; copy 16 bytes + mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE] + mov [p + 3*4], DWORD(tmp5) + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest (16B), outer_block (16B) and extra_block (64B) + ;; of 
returned job and NULL jobs +%assign I 0 +%rep 8 + cmp qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest (16 bytes) +%assign J 0 +%rep 4 + mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0 +%assign J (J+1) +%endrep + + lea lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 16 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm new file mode 100644 index 000000000..acf78fd6d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm @@ -0,0 +1,356 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/memcpy.asm" +%include "include/reg_sizes.asm" +%include "include/const.inc" + +extern md5_x4x2_sse + +section .data +default rel +align 16 +;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_md5_sse,function,internal) +submit_job_hmac_md5_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov [state + _unused_lanes_md5], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + + ;; insert len into proper lane + movdqa xmm0, [state + _lens_md5] + XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 + movdqa [state + _lens_md5], xmm0 + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] +; bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], 
xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + movdqa xmm0, [state + _lens_md5] + XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + movdqa [state + _lens_md5], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xf + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_md5] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshufb xmm1, [rel dupw] ; duplicate words across all lanes + psubw xmm0, xmm1 + movdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + + movdqa xmm0, [state + _lens_md5] + XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 + movdqa [state + _lens_md5], xmm0 + + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 +; pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + + movdqa xmm0, [state + _lens_md5] + XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + movdqa [state + _lens_md5], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exiting + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_md5] + 
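;; Editor's note (not part of the upstream file): copy_lt64 handles messages
;; shorter than one 64-byte MD5 block. Sketch of the buffer layout it relies on:
;;   p2 = _extra_block + 64 - len   ; copied message ends exactly at offset 64,
;;                                  ; right where the 0x80 pad byte is
;;                                  ; pre-populated (see the comment above)
;;   the 8*(64+len)-bit length (counting the ipad block) is stored at size_offset
;;   after the jump back to end_fast_copy, closing out the padded block(s)
;; unused_lanes has to be reloaded before leaving this path because the copy
;; scratch overlaps it (tmp4 and unused_lanes are both rbx in this file).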
jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_md5] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + + cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ; copy 16 bytes + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE] + mov [p + 3*4], DWORD(tmp3) + +clear_ret: + +%ifdef SAFE_DATA + ;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job + mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0 + + pxor xmm0, xmm0 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 16 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm new file mode 100644 index 000000000..23fcd74d7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm @@ -0,0 +1,28 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%define SHA224 +%include "sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm new file mode 100644 index 000000000..e1f11a44f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC flush_job_hmac_sha_224_sse +%define SHA224 + +%include "sse/mb_mgr_hmac_sha_256_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm new file mode 100644 index 000000000..12c0350af --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm @@ -0,0 +1,28 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%define SHA224 +%include "sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm new file mode 100644 index 000000000..111f5092c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%define FUNC submit_job_hmac_sha_224_sse +%define SHA224 + +%include "sse/mb_mgr_hmac_sha_256_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm new file mode 100644 index 000000000..9a2f20ffc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm @@ -0,0 +1,333 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha256_ni + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%define bswap_xmm4 xmm4 + +struc STACK +_gpr_save: resq 4 ;rbx, rbp, rsi (win), rdi (win) +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +one: dq 1 + +section .text + +%ifdef SHA224 +;; JOB* flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) +;; arg1 : state +MKGLOBAL(flush_job_hmac_sha_224_ni_sse,function,internal) +flush_job_hmac_sha_224_ni_sse: +%else +;; JOB* flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) +;; arg1 : state +MKGLOBAL(flush_job_hmac_sha_256_ni_sse,function,internal) +flush_job_hmac_sha_256_ni_sse: +%endif + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha256-ni-sse flush" + + mov unused_lanes, [state + _unused_lanes_sha256] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job, assume it is 0 then check 1 + xor idx, idx + cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + DBGPRINTL64 "idx:", idx + +copy_lane_data: + ; copy idx to empty lanes + mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx] + xor len2, len2 + mov WORD(len2), word [state + _lens_sha256 + idx*2] + + ; there are only two lanes so if one is empty it is easy to determine which one + xor idx, 1 + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + xor idx, 1 + + ; No need to find min length - only two lanes available + cmp len2, 0 + je len_is_0 + + ; set length on both lanes to 0 + mov dword [state + _lens_sha256], 0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_ni + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + movdqa bswap_xmm4, [rel byteswap] + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below 
code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 + movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] + movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + ;; overwrite top 4 bytes with 0x80 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + DBGPRINTL "sha256 outer hash input words:" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + DBGPRINTL64 "auth_key_xor_opad", tmp + movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 + DBGPRINTL "new digest args" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha256] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 16 bytes for SHA256, 14 bytes for SHA224 +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" 
+%endif + shl idx, 5 + +%ifdef SHA224 + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16 + jne copy_full_digest +%endif + movdqu xmm0, [state + _args_digest_sha256 + idx] + pshufb xmm0, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movq [p + 0*4], xmm0 + pextrd [p + 2*4], xmm0, 2 + pextrw [p + 3*4], xmm0, 6 +%else + ;; SHA256 + movdqu [p], xmm0 +%endif + DBGPRINTL "auth_tag_output:" + DBGPRINT_XMM xmm0 + jmp clear_ret + +copy_full_digest: + movdqu xmm0, [state + _args_digest_sha256 + idx] + movdqu xmm1, [state + _args_digest_sha256 + idx + 16] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movdqu [p], xmm0 + movq [p + 16], xmm1 + pextrd [p + 16 + 8], xmm1, 2 +%else + ;; SHA256 + movdqu [p], xmm0 + movdqu [p + 16], xmm1 +%endif + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest, outer_block (28B/32B) and extra_block (64B) + ;; of returned job and NULL jobs +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest + movdqa [state + _args_digest_sha256 + I*32], xmm0 + movdqa [state + _args_digest_sha256 + I*32 + 16], xmm0 + + lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block + movdqa [lane_data + _outer_block], xmm0 +%ifdef SHA224 + mov qword [lane_data + _outer_block + 16], 0 + mov dword [lane_data + _outer_block + 24], 0 +%else + movdqa [lane_data + _outer_block + 16], xmm0 +%endif + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + DBGPRINTL "exit sha256-ni-sse flush" + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm new file mode 100644 index 000000000..5ab064b89 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm @@ -0,0 +1,356 @@ + ;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +extern sha_256_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_256_sse +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp; called routine also clobbers r12 +struc STACK +_gpr_save: resq 3 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy idx to empty lanes + movdqa xmm0, [state + _lens_sha256] + mov tmp, [state + _args_data_ptr_sha256 + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_sha256 + 8*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_sha256], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_sse + ; state 
and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + pshufb xmm1, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha256] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + +%ifdef SHA224 + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16 + jne copy_full_digest +%endif + ;; copy 14 bytes for SHA224 / 16 bytes for SHA256 + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp5), 
[state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp6) + bswap DWORD(tmp5) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp6) +%ifdef SHA224 + mov [p + 3*4], WORD(tmp5) +%else + mov [p + 3*4], DWORD(tmp5) +%endif + jmp clear_ret + +copy_full_digest: + ;; copy 28 bytes for SHA224 / 32 bytes for SHA256 + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp6) + bswap DWORD(tmp5) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp6) + mov [p + 3*4], DWORD(tmp5) + + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE] +%ifndef SHA224 + mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE] +%endif + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp6) +%ifndef SHA224 + bswap DWORD(tmp5) +%endif + mov [p + 4*4], DWORD(tmp2) + mov [p + 5*4], DWORD(tmp4) + mov [p + 6*4], DWORD(tmp6) +%ifndef SHA224 + mov [p + 7*4], DWORD(tmp5) +%endif + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) + ;; of returned job and NULL jobs +%assign I 0 +%rep 4 + cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256 bytes) +%assign J 0 +%rep 7 + mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0 +%assign J (J+1) +%endrep +%ifndef SHA224 + mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0 +%endif + + lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block + movdqa [lane_data + _outer_block], xmm0 +%ifdef SHA224 + mov qword [lane_data + _outer_block + 16], 0 + mov dword [lane_data + _outer_block + 24], 0 +%else + movdqa [lane_data + _outer_block + 16], xmm0 +%endif + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm new file mode 100644 index 000000000..d4ded1f6d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm @@ -0,0 +1,401 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * 
Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; callee saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha256_ni + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%define bswap_xmm4 xmm4 + +struc STACK +_gpr_save: resq 4 ; rbx, rbp, rsi (win), rdi (win) +_rsp_save: resq 1 +endstruc + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +section .text + +%ifdef SHA224 +; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal) +submit_job_hmac_sha_224_ni_sse: + +%else + +; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal) +submit_job_hmac_sha_256_ni_sse: +%endif + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha256-ni-sse submit" + + mov unused_lanes, [state + _unused_lanes_sha256] + movzx lane, BYTE(unused_lanes) + DBGPRINTL64 "lane: ", lane + shr unused_lanes, 8 + imul lane_data, lane, 
_HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same + lea lane_data, [state + _ldata_sha256 + lane_data] + mov [state + _unused_lanes_sha256], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + DBGPRINTL64 "length: ", len + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_sha256 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha256 + 8*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp, [lane*8] ; x8 here plus x4 scale factor give x32 + movdqu [state + _args_digest_sha256 + tmp*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1 + DBGPRINTL "args digest:" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length - only two lanes available + xor len2, len2 + mov tmp, 0x10000 + mov WORD(len2), word [state + _lens_sha256 + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) + mov WORD(tmp), word [state + _lens_sha256 + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) + cmp WORD(len2), WORD(tmp) + cmovg DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length + + mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields + shr DWORD(idx), 16 + and DWORD(len2), 0xffff + je len_is_0 + + sub word [state + _lens_sha256 + 0*2], WORD(len2) + sub word [state + _lens_sha256 + 1*2], WORD(len2) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_ni + ; state is intact +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + movdqa bswap_xmm4, [rel byteswap] + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + 
_lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 + movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] + movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + ;; overwrite top 4 bytes with 0x80 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exit + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha256] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_sha256] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 16 bytes for SHA256, 14 for SHA224 +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" 
+%endif + shl idx, 5 + +%ifdef SHA224 + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16 + jne copy_full_digest +%endif + + movdqu xmm0, [state + _args_digest_sha256 + idx] + pshufb xmm0, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movq [p + 0*4], xmm0 + pextrd [p + 2*4], xmm0, 2 + pextrw [p + 3*4], xmm0, 6 +%else + ;; SHA256 + movdqu [p], xmm0 +%endif + jmp clear_ret + +copy_full_digest: + movdqu xmm0, [state + _args_digest_sha256 + idx] + movdqu xmm1, [state + _args_digest_sha256 + idx + 16] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movdqu [p], xmm0 + movq [p + 16], xmm1 + pextrd [p + 16 + 8], xmm1, 2 +%else + ;; SHA256 + movdqu [p], xmm0 + movdqu [p + 16], xmm1 +%endif + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + ;; Clear digest, outer_block (28B/32B) and extra_block (64B) of returned job + movdqa [state + _args_digest_sha256 + idx], xmm0 + movdqa [state + _args_digest_sha256 + idx + 16], xmm0 + + shr idx, 5 ;; Restore lane idx to 0 or 1 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block + movdqa [lane_data + _outer_block], xmm0 +%ifdef SHA224 + mov qword [lane_data + _outer_block + 16], 0 + mov dword [lane_data + _outer_block + 24], 0 +%else + movdqa [lane_data + _outer_block + 16], xmm0 +%endif +%endif ;; SAFE_DATA + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm new file mode 100644 index 000000000..8025b2f96 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm @@ -0,0 +1,427 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%include "include/const.inc" + +extern sha_256_mult_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_256_sse +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 +struc STACK +_gpr_save: resq 5 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 +%ifndef LINUX + mov [rsp + _gpr_save + 8*3], rsi + mov [rsp + _gpr_save + 8*4], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov [state + _unused_lanes_sha256], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + + movdqa xmm0, [state + _lens_sha256] + XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 + movdqa [state + _lens_sha256], xmm0 + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha256 + 8*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 
64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + movdqa xmm0, [state + _lens_sha256] + XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + movdqa [state + _lens_sha256], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_sha256] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + + movdqa xmm0, [state + _lens_sha256] + XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 + movdqa [state + _lens_sha256], xmm0 + + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + pshufb xmm1, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd 
[state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + + movdqa xmm0, [state + _lens_sha256] + XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + movdqa [state + _lens_sha256], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exit + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha256] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_sha256] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + +%ifdef SHA224 + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16 + jne copy_full_digest +%endif + + ;; copy 14 bytes for SHA224 / 16 bytes for SHA256 + mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + bswap DWORD(tmp4) + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) +%ifdef SHA224 + mov [p + 3*4], WORD(tmp4) +%else + mov [p + 3*4], DWORD(tmp4) +%endif + jmp clear_ret + +copy_full_digest: + ;; copy 28 bytes for SHA224 / 32 bytes for SHA256 + mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + bswap DWORD(tmp4) + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + mov [p + 3*4], DWORD(tmp4) + + mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 
6*SHA256_DIGEST_ROW_SIZE] +%ifndef SHA224 + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE] +%endif + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) +%ifndef SHA224 + bswap DWORD(tmp4) +%endif + mov [p + 4*4], DWORD(tmp) + mov [p + 5*4], DWORD(tmp2) + mov [p + 6*4], DWORD(tmp3) +%ifndef SHA224 + mov [p + 7*4], DWORD(tmp4) +%endif + +clear_ret: + +%ifdef SAFE_DATA + ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job +%assign J 0 +%rep 7 + mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0 +%assign J (J+1) +%endrep +%ifndef SHA224 + mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0 +%endif + + pxor xmm0, xmm0 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block + movdqa [lane_data + _outer_block], xmm0 +%ifdef SHA224 + mov qword [lane_data + _outer_block + 16], 0 + mov dword [lane_data + _outer_block + 24], 0 +%else + movdqa [lane_data + _outer_block + 16], xmm0 +%endif +%endif ;; SAFE_DATA + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*3] + mov rdi, [rsp + _gpr_save + 8*4] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm new file mode 100644 index 000000000..bc7305001 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%define FUNC flush_job_hmac_sha_384_sse +%define SHA_X_DIGEST_SIZE 384 + +%include "sse/mb_mgr_hmac_sha_512_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm new file mode 100644 index 000000000..04d7d3aaf --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC submit_job_hmac_sha_384_sse +%define SHA_X_DIGEST_SIZE 384 + +%include "sse/mb_mgr_hmac_sha_512_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm new file mode 100644 index 000000000..40f61fa4d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm @@ -0,0 +1,331 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" + +extern sha512_x2_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 +one: dq 1 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_512_sse +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + cmovne idx, [rel one] +copy_lane_data: + ; copy good lane (idx) to empty lanes + movdqa xmm0, [state + _lens_sha512] + mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + jne APPEND(skip_,I) + mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_sha512], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0xA0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov 
DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + +%assign I 0 +%rep (SHA_X_DIGEST_SIZE / (8*16)) + movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] + pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1) *SHA512_DIGEST_ROW_SIZE], 1 + pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 +%assign I (I+1) +%endrep + + mov tmp, [job + _auth_key_xor_opad] +%assign I 0 +%rep 4 + movdqu xmm0, [tmp + I * 16] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha512] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + +%if (SHA_X_DIGEST_SIZE != 384) + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 + jne copy_full_digest +%endif + ;; copy 32 bytes for SHA512 // 24 bytes for SHA384 + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] +%endif + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp5) +%endif + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp5) +%endif + jmp clear_ret +copy_full_digest: + ;; copy 32 bytes for SHA512 // 24 bytes for SHA384 + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) + bswap QWORD(tmp5) + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) + mov [p + 3*8], QWORD(tmp5) + + mov QWORD(tmp2), [state + _args_digest_sha512 
+ SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE] +%endif + bswap QWORD(tmp2) + bswap QWORD(tmp4) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp6) + bswap QWORD(tmp5) +%endif + mov [p + 4*8], QWORD(tmp2) + mov [p + 5*8], QWORD(tmp4) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 6*8], QWORD(tmp6) + mov [p + 7*8], QWORD(tmp5) +%endif + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + + ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512 bytes) +%assign J 0 +%rep 6 + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0 +%assign J (J+1) +%endrep +%if (SHA_X_DIGEST_SIZE != 384) + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0 + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0 +%endif + + lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)] + ;; Clear first 128 bytes of extra_block +%assign offset 0 +%rep 8 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 16], xmm0 + movdqa [lane_data + _outer_block + 32], xmm0 +%if (SHA_X_DIGEST_SIZE != 384) + movdqa [lane_data + _outer_block + 48], xmm0 +%endif + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm new file mode 100644 index 000000000..0d6da7bce --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm @@ -0,0 +1,412 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%include "include/const.inc" + +extern sha512_x2_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_512_sse +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512+ lane_data] + mov [state + _unused_lanes_sha512], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 7 ; divide by 128, len in terms of sha512 blocks + + mov [lane_data + _job_in_lane_sha512], job + mov dword [lane_data + _outer_done_sha512], 0 + + movdqa xmm0, [state + _lens_sha512] + XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 + movdqa [state + _lens_sha512], xmm0 + + mov last_len, len + and last_len, 127 + lea extra_blocks, [last_len + 17 + 127] + shr extra_blocks, 7 + mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p + + cmp len, 128 + jb copy_lt128 + +fast_copy: + add p, len +%assign I 0 +%rep 2 + movdqu xmm0, [p - 128 + I*4*16 + 0*16] + movdqu xmm1, [p - 128 + I*4*16 + 1*16] + movdqu xmm2, [p - 128 + I*4*16 + 2*16] + movdqu xmm3, [p - 128 + I*4*16 + 
3*16] + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3 +%assign I (I+1) +%endrep +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 7 + sub size_offset, last_len + add size_offset, 128-8 + mov [lane_data + _size_offset_sha512], DWORD(size_offset) + mov start_offset, 128 + sub start_offset, last_len + mov [lane_data + _start_offset_sha512], DWORD(start_offset) + + lea tmp, [8*128 + 8*len] + bswap tmp + mov [lane_data + _extra_block_sha512 + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + %assign I 0 + %rep 4 + movdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 + %assign I (I+1) + %endrep + test len, ~127 + jnz ge128_bytes + +lt128_bytes: + movdqa xmm0, [state + _lens_sha512] + XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + movdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 + mov dword [lane_data + _extra_blocks_sha512], 0 + +ge128_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_sha512] + phminposuw xmm1, xmm0 + pextrw DWORD(len2), xmm1, 0 ; min value + pextrw DWORD(idx), xmm1, 1 ; min index (0...1) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0XA0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + + movdqa xmm0, [state + _lens_sha512] + XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 + movdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + +%assign I 0 +%rep (SHA_X_DIGEST_SIZE / (8 * 16)) + movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] + pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 + pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 +%assign I (I+1) +%endrep + + mov tmp, [job + _auth_key_xor_opad] +%assign I 0 +%rep 4 + movdqu xmm0, [tmp + I*16] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + + movdqa xmm0, 
[state + _lens_sha512] + XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + movdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp start_loop + + align 16 +copy_lt128: + ;; less than one message block of data + ;; beginning of source block + ;; destination extra block but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 128] + sub p2, len + memcpy_sse_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha512] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov unused_lanes, [state + _unused_lanes_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + +%if (SHA_X_DIGEST_SIZE != 384) + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 + jne copy_full_digest +%endif + + ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 + mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512 +%endif + bswap QWORD(tmp) + bswap QWORD(tmp2) + bswap QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp4) +%endif + mov [p + 0*8], QWORD(tmp) + mov [p + 1*8], QWORD(tmp2) + mov [p + 2*8], QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp4) +%endif + jmp clear_ret + +copy_full_digest: + ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 + mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512 + bswap QWORD(tmp) + bswap QWORD(tmp2) + bswap QWORD(tmp3) + bswap QWORD(tmp4) + mov [p + 0*8], QWORD(tmp) + mov [p + 1*8], QWORD(tmp2) + mov [p + 2*8], QWORD(tmp3) + mov [p + 3*8], QWORD(tmp4) + mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512 +%endif + bswap QWORD(tmp) + bswap QWORD(tmp2) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp3) + bswap QWORD(tmp4) +%endif + mov [p + 4*8], QWORD(tmp) + mov [p + 
5*8], QWORD(tmp2) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 6*8], QWORD(tmp3) + mov [p + 7*8], QWORD(tmp4) +%endif + +clear_ret: + +%ifdef SAFE_DATA + ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job +%assign J 0 +%rep 6 + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0 +%assign J (J+1) +%endrep +%if (SHA_X_DIGEST_SIZE != 384) + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0 + mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0 +%endif + + pxor xmm0, xmm0 + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + ;; Clear first 128 bytes of extra_block +%assign offset 0 +%rep 8 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 16], xmm0 + movdqa [lane_data + _outer_block + 32], xmm0 +%if (SHA_X_DIGEST_SIZE != 384) + movdqa [lane_data + _outer_block + 48], xmm0 +%endif +%endif ;; SAFE_DATA + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm new file mode 100644 index 000000000..e0b0460f4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm @@ -0,0 +1,370 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 +;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 +;; Linux preserves: RBX RBP R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha1_ni + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp +%define p4 rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx +%define p3 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_ni_sse,function,internal) +submit_job_hmac_ni_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-ni-sse submit" + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + DBGPRINTL64 "lane: ", lane + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + DBGPRINTL64 "length: ", len + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + DBGPRINTL64 "src pointer + offset:", p + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + 
add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea p4, [lane + lane*4] + movdqu [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 + mov [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length - only two lanes available + xor len2, len2 + mov p3, 0x10000 + mov WORD(len2), word [state + _lens + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) + mov WORD(p3), word [state + _lens + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) + cmp WORD(len2), WORD(p3) + cmovg DWORD(len2), DWORD(p3) ; move if lane 0 length is greater than lane 1 length + + mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields + shr DWORD(idx), 16 + and DWORD(len2), 0xffff + je len_is_0 + + sub word [state + _lens + 0*2], WORD(len2) + sub word [state + _lens + 1*2], WORD(len2) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_ni + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" 
+%endif + lea p3, [idx + idx*4] + movdqu xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE] + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] + movdqu [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 + mov [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea idx, [idx + 4*idx] + mov DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ;; copy remaining 8 bytes to return 20 byte digest + mov DWORD(tmp), [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp2), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + mov [p + 3*4], DWORD(tmp) + mov [p + 4*4], DWORD(tmp2) + +clear_ret: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + ;; Clear digest (20B), outer_block (20B) and extra_block (64B) + ;; idx = 0 or 5 (depending on lane) + movdqu [state + _args_digest + idx*4], xmm0 + mov dword [state + _args_digest + idx*4 + 16], 0 + + shr idx, 2 ;; idx == 5 ? 
1 : 0 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear 20 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 + mov dword [lane_data + _outer_block + 16], 0 +%endif + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm new file mode 100644 index 000000000..bc59e7943 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm @@ -0,0 +1,364 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%include "include/const.inc" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +extern sha1_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rdi, rsi, rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_sse,function, internal) +submit_job_hmac_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-sse submit" + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + + movdqa xmm0, [state + _lens] + XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 + movdqa [state + _lens], xmm0 + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + 
SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + movdqa xmm0, [state + _lens] + XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + movdqa [state + _lens], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + movdqa xmm0, [state + _lens] + jmp start_loop + + align 16 +start_loop: + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_sse + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + + movdqa xmm1, [state + _lens] + XPINSRW xmm1, xmm2, tmp, idx, 1, scale_x16 + movdqa [state + _lens], xmm1 + + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + movdqa xmm0, xmm1 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + + movdqa xmm0, [state + _lens] + XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + movdqa [state + _lens], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, 
len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ;; copy remaining 8 bytes to return 20 byte digest + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + +clear_ret: + +%ifdef SAFE_DATA + ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0 + + pxor xmm0, xmm0 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + movdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 20 bytes of outer_block + movdqa [lane_data + _outer_block], xmm0 + mov dword [lane_data + _outer_block + 16], 0 +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c new file mode 100644 index 000000000..4d862cba2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c @@ -0,0 +1,809 @@ +/******************************************************************************* + Copyright (c) 2012-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + + +#include +#include +#include + +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse + +#include "intel-ipsec-mb.h" +#include "include/kasumi_internal.h" +#include "include/zuc_internal.h" +#include "include/snow3g.h" + +#include "save_xmms.h" +#include "asm.h" +#include "des.h" +#include "cpu_feature.h" +#include "noaesni.h" + +JOB_AES_HMAC *submit_job_aes128_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes128_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes192_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes192_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes256_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes256_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC 
*flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state); + + +JOB_AES_HMAC *submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state); + +JOB_AES_HMAC *submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, + JOB_AES_HMAC *job); + +JOB_AES_HMAC *flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state); + +JOB_AES_HMAC *submit_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state, + JOB_AES_HMAC *job); + +JOB_AES_HMAC *flush_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state); + +JOB_AES_HMAC *submit_job_aes_cntr_sse(JOB_AES_HMAC *job); + +JOB_AES_HMAC *submit_job_aes_cntr_bit_sse(JOB_AES_HMAC *job); + +#define SAVE_XMMS save_xmms +#define RESTORE_XMMS restore_xmms + +#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_sse +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse +#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_sse +#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_sse +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse +#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_sse +#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_sse +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse +#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_sse +#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_sse +#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_sse +#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_sse +#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_sse +#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_sse +#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_sse +#define SUBMIT_JOB_HMAC submit_job_hmac_sse +#define FLUSH_JOB_HMAC flush_job_hmac_sse +#define SUBMIT_JOB_HMAC_NI submit_job_hmac_ni_sse +#define FLUSH_JOB_HMAC_NI flush_job_hmac_ni_sse +#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_sse +#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_sse +#define SUBMIT_JOB_HMAC_SHA_224_NI submit_job_hmac_sha_224_ni_sse +#define FLUSH_JOB_HMAC_SHA_224_NI flush_job_hmac_sha_224_ni_sse +#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_sse +#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_sse +#define SUBMIT_JOB_HMAC_SHA_256_NI submit_job_hmac_sha_256_ni_sse +#define FLUSH_JOB_HMAC_SHA_256_NI flush_job_hmac_sha_256_ni_sse +#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_sse +#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_sse +#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_sse +#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_sse +#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_sse +#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_sse +#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse +#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse + +#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_sse +#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_sse + +#define AES_CBC_DEC_128 aes_cbc_dec_128_sse +#define AES_CBC_DEC_192 aes_cbc_dec_192_sse +#define AES_CBC_DEC_256 aes_cbc_dec_256_sse + +#define AES_CNTR_128 aes_cntr_128_sse +#define AES_CNTR_192 aes_cntr_192_sse +#define AES_CNTR_256 aes_cntr_256_sse + +#define AES_CNTR_CCM_128 aes_cntr_ccm_128_sse + +#define AES_ECB_ENC_128 aes_ecb_enc_128_sse +#define AES_ECB_ENC_192 aes_ecb_enc_192_sse +#define AES_ECB_ENC_256 aes_ecb_enc_256_sse +#define AES_ECB_DEC_128 aes_ecb_dec_128_sse +#define AES_ECB_DEC_192 aes_ecb_dec_192_sse +#define AES_ECB_DEC_256 aes_ecb_dec_256_sse + +#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_sse +#define SUBMIT_JOB_PON_DEC 
submit_job_pon_dec_sse +#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_sse +#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_sse + +#ifndef NO_GCM +#define AES_GCM_DEC_128 aes_gcm_dec_128_sse +#define AES_GCM_ENC_128 aes_gcm_enc_128_sse +#define AES_GCM_DEC_192 aes_gcm_dec_192_sse +#define AES_GCM_ENC_192 aes_gcm_enc_192_sse +#define AES_GCM_DEC_256 aes_gcm_dec_256_sse +#define AES_GCM_ENC_256 aes_gcm_enc_256_sse + +#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_sse +#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_dec_sse +#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_sse +#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_enc_sse +#endif /* NO_GCM */ + +/* ====================================================================== */ + +#define SUBMIT_JOB submit_job_sse +#define FLUSH_JOB flush_job_sse +#define SUBMIT_JOB_NOCHECK submit_job_nocheck_sse +#define GET_NEXT_JOB get_next_job_sse +#define GET_COMPLETED_JOB get_completed_job_sse + +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse +#define QUEUE_SIZE queue_size_sse + +/* ====================================================================== */ + +#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_SSE +#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_SSE +#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_SSE +#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_SSE +#define FLUSH_JOB_HASH FLUSH_JOB_HASH_SSE + +/* ====================================================================== */ + +#define AES_CFB_128_ONE aes_cfb_128_one_sse + +void aes128_cbc_mac_x4(AES_ARGS *args, uint64_t len); + +#define AES128_CBC_MAC aes128_cbc_mac_x4 + +#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_sse +#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_sse + +#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse +#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse + +/* ====================================================================== */ + +/* + * Used to decide if SHA1/SHA256 SIMD or SHA1NI OOO scheduler should be + * called. 
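+ * Roughly, the shared scheduler code in mb_mgr_code.h (included at the
+ * end of this file) is expected to do something like the following for
+ * SHA1 HMAC jobs when HASH_USE_SHAEXT is defined (sketch, not verbatim):
+ *
+ *     if (state->features & IMB_FEATURE_SHANI)
+ *         return SUBMIT_JOB_HMAC_NI(&state->hmac_sha_1_ooo, job);
+ *     return SUBMIT_JOB_HMAC(&state->hmac_sha_1_ooo, job);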
+ */ +#define HASH_USE_SHAEXT 1 + + +/* ====================================================================== */ + +/* + * GCM submit / flush API for SSE arch + */ +#ifndef NO_GCM +static JOB_AES_HMAC * +submit_job_aes_gcm_dec_sse(MB_MGR *state, JOB_AES_HMAC *job) +{ + DECLARE_ALIGNED(struct gcm_context_data ctx, 16); + (void) state; + + if (16 == job->aes_key_len_in_bytes) + AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, + job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + else if (24 == job->aes_key_len_in_bytes) + AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, + job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + else /* assume 32 bytes */ + AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, + job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + + job->status = STS_COMPLETED; + return job; +} + +static JOB_AES_HMAC * +flush_job_aes_gcm_dec_sse(MB_MGR *state, JOB_AES_HMAC *job) +{ + (void) state; + (void) job; + return NULL; +} + +static JOB_AES_HMAC * +submit_job_aes_gcm_enc_sse(MB_MGR *state, JOB_AES_HMAC *job) +{ + DECLARE_ALIGNED(struct gcm_context_data ctx, 16); + (void) state; + + if (16 == job->aes_key_len_in_bytes) + AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + else if (24 == job->aes_key_len_in_bytes) + AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + else /* assume 32 bytes */ + AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst, + job->src + + job->cipher_start_src_offset_in_bytes, + job->msg_len_to_cipher_in_bytes, job->iv, + job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, + job->auth_tag_output, + job->auth_tag_output_len_in_bytes); + + job->status = STS_COMPLETED; + return job; +} + +static JOB_AES_HMAC * +flush_job_aes_gcm_enc_sse(MB_MGR *state, JOB_AES_HMAC *job) +{ + (void) state; + (void) job; + return NULL; +} +#endif /* NO_GCM */ + +IMB_DLL_LOCAL JOB_AES_HMAC * +submit_job_aes_cntr_sse(JOB_AES_HMAC *job) +{ + if (16 == job->aes_key_len_in_bytes) + AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bytes, + job->iv_len_in_bytes); + else if (24 == job->aes_key_len_in_bytes) + AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bytes, + job->iv_len_in_bytes); + else /* assume 32 bytes */ + AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bytes, + job->iv_len_in_bytes); + + job->status |= STS_COMPLETED_AES; + return job; +} + +IMB_DLL_LOCAL JOB_AES_HMAC * 
+submit_job_aes_cntr_bit_sse(JOB_AES_HMAC *job) +{ + if (16 == job->aes_key_len_in_bytes) + aes_cntr_bit_128_sse(job->src + + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bits, + job->iv_len_in_bytes); + else if (24 == job->aes_key_len_in_bytes) + aes_cntr_bit_192_sse(job->src + + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bits, + job->iv_len_in_bytes); + else /* assume 32 bytes */ + aes_cntr_bit_256_sse(job->src + + job->cipher_start_src_offset_in_bytes, + job->iv, + job->aes_enc_key_expanded, + job->dst, + job->msg_len_to_cipher_in_bits, + job->iv_len_in_bytes); + + job->status |= STS_COMPLETED_AES; + return job; +} + +/* ====================================================================== */ + +void +init_mb_mgr_sse(MB_MGR *state) +{ + unsigned int j; + uint8_t *p; + size_t size; + + state->features = cpu_feature_adjust(state->flags, + cpu_feature_detect()); + + if (!(state->features & IMB_FEATURE_AESNI)) { + init_mb_mgr_sse_no_aesni(state); + return; + } + + /* Init AES out-of-order fields */ + memset(state->aes128_ooo.lens, 0xFF, + sizeof(state->aes128_ooo.lens)); + memset(&state->aes128_ooo.lens[0], 0, + sizeof(state->aes128_ooo.lens[0]) * 4); + memset(state->aes128_ooo.job_in_lane, 0, + sizeof(state->aes128_ooo.job_in_lane)); + state->aes128_ooo.unused_lanes = 0xFF03020100; + state->aes128_ooo.num_lanes_inuse = 0; + + + memset(state->aes192_ooo.lens, 0xFF, + sizeof(state->aes192_ooo.lens)); + memset(&state->aes192_ooo.lens[0], 0, + sizeof(state->aes192_ooo.lens[0]) * 4); + memset(state->aes192_ooo.job_in_lane, 0, + sizeof(state->aes192_ooo.job_in_lane)); + state->aes192_ooo.unused_lanes = 0xFF03020100; + state->aes192_ooo.num_lanes_inuse = 0; + + + memset(state->aes256_ooo.lens, 0xFF, + sizeof(state->aes256_ooo.lens)); + memset(&state->aes256_ooo.lens[0], 0, + sizeof(state->aes256_ooo.lens[0]) * 4); + memset(state->aes256_ooo.job_in_lane, 0, + sizeof(state->aes256_ooo.job_in_lane)); + state->aes256_ooo.unused_lanes = 0xFF03020100; + state->aes256_ooo.num_lanes_inuse = 0; + + + /* DOCSIS SEC BPI uses same settings as AES128 CBC */ + memset(state->docsis_sec_ooo.lens, 0xFF, + sizeof(state->docsis_sec_ooo.lens)); + memset(&state->docsis_sec_ooo.lens[0], 0, + sizeof(state->docsis_sec_ooo.lens[0]) * 4); + memset(state->docsis_sec_ooo.job_in_lane, 0, + sizeof(state->docsis_sec_ooo.job_in_lane)); + state->docsis_sec_ooo.unused_lanes = 0xFF03020100; + state->docsis_sec_ooo.num_lanes_inuse = 0; + + + /* Init HMAC/SHA1 out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0; + state->hmac_sha_1_ooo.lens[3] = 0; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA1_LANES; j++) { + state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_1_ooo.ldata[j].outer_block; + memset(p + 5*4 + 1, + 0x00, + 64 - 5*4 - 1 - 2); + p[5*4] = 0x80; + p[64-2] = 0x02; + p[64-1] = 0xA0; + } + +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA1 NI out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + 
state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0xFFFF; + state->hmac_sha_1_ooo.lens[3] = 0xFFFF; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA224 out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0; + state->hmac_sha_224_ooo.lens[3] = 0; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { + state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; + + p = state->hmac_sha_224_ooo.ldata[j].extra_block; + size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block); + memset (p, 0x00, size); + p[64] = 0x80; + + p = state->hmac_sha_224_ooo.ldata[j].outer_block; + size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block); + memset(p, 0x00, size); + p[7*4] = 0x80; /* digest 7 words long */ + p[64-2] = 0x02; /* length in little endian = 0x02E0 */ + p[64-1] = 0xE0; + } +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA224 NI out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0xFFFF; + state->hmac_sha_224_ooo.lens[3] = 0xFFFF; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA_256 out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0; + state->hmac_sha_256_ooo.lens[3] = 0; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { + state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_256_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); /* digest is 8*4 bytes long */ + p[8*4] = 0x80; + p[64-2] = 0x03; /* length of (opad (64*8) bits + 256 bits) + * in hex is 0x300 */ + p[64-1] = 0x00; + } +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA256 NI out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0xFFFF; + state->hmac_sha_256_ooo.lens[3] = 0xFFFF; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA384 out-of-order fields */ + state->hmac_sha_384_ooo.lens[0] = 0; + state->hmac_sha_384_ooo.lens[1] = 0; + state->hmac_sha_384_ooo.lens[2] = 0xFFFF; + state->hmac_sha_384_ooo.lens[3] = 
0xFFFF; + state->hmac_sha_384_ooo.lens[4] = 0xFFFF; + state->hmac_sha_384_ooo.lens[5] = 0xFFFF; + state->hmac_sha_384_ooo.lens[6] = 0xFFFF; + state->hmac_sha_384_ooo.lens[7] = 0xFFFF; + state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), + 0x00, SHA_384_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_384_BLOCK_SIZE - + SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); + p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ + /* + * hmac outer block length always of fixed size, it is OKey + * length, a whole message block length, 1024 bits, with padding + * plus the length of the inner digest, which is 384 bits + * 1408 bits == 0x0580. The input message block needs to be + * converted to big endian within the sha implementation + * before use. + */ + p[SHA_384_BLOCK_SIZE - 2] = 0x05; + p[SHA_384_BLOCK_SIZE - 1] = 0x80; + } + + /* Init HMAC/SHA512 out-of-order fields */ + state->hmac_sha_512_ooo.lens[0] = 0; + state->hmac_sha_512_ooo.lens[1] = 0; + state->hmac_sha_512_ooo.lens[2] = 0xFFFF; + state->hmac_sha_512_ooo.lens[3] = 0xFFFF; + state->hmac_sha_512_ooo.lens[4] = 0xFFFF; + state->hmac_sha_512_ooo.lens[5] = 0xFFFF; + state->hmac_sha_512_ooo.lens[6] = 0xFFFF; + state->hmac_sha_512_ooo.lens[7] = 0xFFFF; + state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), + 0x00, SHA_512_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_512_BLOCK_SIZE - + SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); + p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ + /* + * hmac outer block length always of fixed size, it is OKey + * length, a whole message block length, 1024 bits, with padding + * plus the length of the inner digest, which is 512 bits + * 1536 bits == 0x600. The input message block needs to be + * converted to big endian within the sha implementation + * before use. 
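+ * In other words: 1024 + 512 = 1536 bits = 0x0600, which is why the two
+ * trailing length bytes below are set to 0x06 and 0x00.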
+ */ + p[SHA_512_BLOCK_SIZE - 2] = 0x06; + p[SHA_512_BLOCK_SIZE - 1] = 0x00; + } + + /* Init HMAC/MD5 out-of-order fields */ + state->hmac_md5_ooo.lens[0] = 0; + state->hmac_md5_ooo.lens[1] = 0; + state->hmac_md5_ooo.lens[2] = 0; + state->hmac_md5_ooo.lens[3] = 0; + state->hmac_md5_ooo.lens[4] = 0; + state->hmac_md5_ooo.lens[5] = 0; + state->hmac_md5_ooo.lens[6] = 0; + state->hmac_md5_ooo.lens[7] = 0; + state->hmac_md5_ooo.lens[8] = 0xFFFF; + state->hmac_md5_ooo.lens[9] = 0xFFFF; + state->hmac_md5_ooo.lens[10] = 0xFFFF; + state->hmac_md5_ooo.lens[11] = 0xFFFF; + state->hmac_md5_ooo.lens[12] = 0xFFFF; + state->hmac_md5_ooo.lens[13] = 0xFFFF; + state->hmac_md5_ooo.lens[14] = 0xFFFF; + state->hmac_md5_ooo.lens[15] = 0xFFFF; + state->hmac_md5_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < SSE_NUM_MD5_LANES; j++) { + state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; + + p = state->hmac_md5_ooo.ldata[j].extra_block; + size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block); + memset (p, 0x00, size); + p[64] = 0x80; + + p = state->hmac_md5_ooo.ldata[j].outer_block; + size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block); + memset(p, 0x00, size); + p[4*4] = 0x80; + p[64-7] = 0x02; + p[64-8] = 0x80; + } + + /* Init AES/XCBC OOO fields */ + state->aes_xcbc_ooo.lens[0] = 0; + state->aes_xcbc_ooo.lens[1] = 0; + state->aes_xcbc_ooo.lens[2] = 0; + state->aes_xcbc_ooo.lens[3] = 0; + state->aes_xcbc_ooo.lens[4] = 0xFFFF; + state->aes_xcbc_ooo.lens[5] = 0xFFFF; + state->aes_xcbc_ooo.lens[6] = 0xFFFF; + state->aes_xcbc_ooo.lens[7] = 0xFFFF; + state->aes_xcbc_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < 4; j++) { + state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; + state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; + memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); + } + + /* Init AES-CCM auth out-of-order fields */ + for (j = 0; j < 4; j++) { + state->aes_ccm_ooo.init_done[j] = 0; + state->aes_ccm_ooo.lens[j] = 0; + state->aes_ccm_ooo.job_in_lane[j] = NULL; + } + for (; j < 8; j++) + state->aes_ccm_ooo.lens[j] = 0xFFFF; + + state->aes_ccm_ooo.unused_lanes = 0xF3210; + + /* Init AES-CMAC auth out-of-order fields */ + state->aes_cmac_ooo.lens[0] = 0; + state->aes_cmac_ooo.lens[1] = 0; + state->aes_cmac_ooo.lens[2] = 0; + state->aes_cmac_ooo.lens[3] = 0; + state->aes_cmac_ooo.lens[4] = 0xFFFF; + state->aes_cmac_ooo.lens[5] = 0xFFFF; + state->aes_cmac_ooo.lens[6] = 0xFFFF; + state->aes_cmac_ooo.lens[7] = 0xFFFF; + for (j = 0; j < 4; j++) { + state->aes_cmac_ooo.init_done[j] = 0; + state->aes_cmac_ooo.job_in_lane[j] = NULL; + } + state->aes_cmac_ooo.unused_lanes = 0xF3210; + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + + /* set SSE handlers */ + state->get_next_job = get_next_job_sse; + state->submit_job = submit_job_sse; + state->submit_job_nocheck = submit_job_nocheck_sse; + state->get_completed_job = get_completed_job_sse; + state->flush_job = flush_job_sse; + state->queue_size = queue_size_sse; + state->keyexp_128 = aes_keyexp_128_sse; + state->keyexp_192 = aes_keyexp_192_sse; + state->keyexp_256 = aes_keyexp_256_sse; + state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_sse; + state->xcbc_keyexp = aes_xcbc_expand_key_sse; + state->des_key_sched = des_key_schedule; + state->sha1_one_block = sha1_one_block_sse; + state->sha1 = sha1_sse; + state->sha224_one_block = sha224_one_block_sse; + state->sha224 = sha224_sse; + state->sha256_one_block = sha256_one_block_sse; + state->sha256 = sha256_sse; + state->sha384_one_block = sha384_one_block_sse; 
+ state->sha384 = sha384_sse; + state->sha512_one_block = sha512_one_block_sse; + state->sha512 = sha512_sse; + state->md5_one_block = md5_one_block_sse; + state->aes128_cfb_one = aes_cfb_128_one_sse; + + state->eea3_1_buffer = zuc_eea3_1_buffer_sse; + state->eea3_4_buffer = zuc_eea3_4_buffer_sse; + state->eea3_n_buffer = zuc_eea3_n_buffer_sse; + state->eia3_1_buffer = zuc_eia3_1_buffer_sse; + + state->f8_1_buffer = kasumi_f8_1_buffer_sse; + state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_sse; + state->f8_2_buffer = kasumi_f8_2_buffer_sse; + state->f8_3_buffer = kasumi_f8_3_buffer_sse; + state->f8_4_buffer = kasumi_f8_4_buffer_sse; + state->f8_n_buffer = kasumi_f8_n_buffer_sse; + state->f9_1_buffer = kasumi_f9_1_buffer_sse; + state->f9_1_buffer_user = kasumi_f9_1_buffer_user_sse; + state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_sse; + state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_sse; + state->kasumi_key_sched_size = kasumi_key_sched_size_sse; + + state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_sse; + state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_sse; + state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_sse; + state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_sse; + state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_sse; + state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_sse; + state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_sse; + state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_sse; + state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_sse; + state->snow3g_init_key_sched = snow3g_init_key_sched_sse; + state->snow3g_key_sched_size = snow3g_key_sched_size_sse; + +#ifndef NO_GCM + state->gcm128_enc = aes_gcm_enc_128_sse; + state->gcm192_enc = aes_gcm_enc_192_sse; + state->gcm256_enc = aes_gcm_enc_256_sse; + state->gcm128_dec = aes_gcm_dec_128_sse; + state->gcm192_dec = aes_gcm_dec_192_sse; + state->gcm256_dec = aes_gcm_dec_256_sse; + state->gcm128_init = aes_gcm_init_128_sse; + state->gcm192_init = aes_gcm_init_192_sse; + state->gcm256_init = aes_gcm_init_256_sse; + state->gcm128_enc_update = aes_gcm_enc_128_update_sse; + state->gcm192_enc_update = aes_gcm_enc_192_update_sse; + state->gcm256_enc_update = aes_gcm_enc_256_update_sse; + state->gcm128_dec_update = aes_gcm_dec_128_update_sse; + state->gcm192_dec_update = aes_gcm_dec_192_update_sse; + state->gcm256_dec_update = aes_gcm_dec_256_update_sse; + state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_sse; + state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_sse; + state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_sse; + state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_sse; + state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_sse; + state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_sse; + state->gcm128_precomp = aes_gcm_precomp_128_sse; + state->gcm192_precomp = aes_gcm_precomp_192_sse; + state->gcm256_precomp = aes_gcm_precomp_256_sse; + state->gcm128_pre = aes_gcm_pre_128_sse; + state->gcm192_pre = aes_gcm_pre_192_sse; + state->gcm256_pre = aes_gcm_pre_256_sse; +#endif +} + +#include "mb_mgr_code.h" diff --git a/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm new file mode 100644 index 000000000..581e3fade --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm @@ -0,0 +1,787 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source 
code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute octal MD5 using SSE + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +section .data align=64 +default rel + +align 64 +MKGLOBAL(MD5_TABLE,data,internal) +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 
0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + +section .text + +%ifdef LINUX +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define mem1 rcx +%define mem2 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define mem1 rdi +%define mem2 rsi +%endif + +;; rbp is not clobbered + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +%define TBL rax +%define IDX rbx + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... 
/ +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +; stack size must be an odd multiple of 8 bytes in size +struc STACK +_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: reso 8 ; stores AA-DD, AA2-DD2 + resb 8 ; for alignment +endstruc +%define STACK_SIZE STACK_size + +%define AA rsp + _DIGEST + 16*0 +%define BB rsp + _DIGEST + 16*1 +%define CC rsp + _DIGEST + 16*2 +%define DD rsp + _DIGEST + 16*3 +%define AA2 rsp + _DIGEST + 16*4 +%define BB2 rsp + _DIGEST + 16*5 +%define CC2 rsp + _DIGEST + 16*6 +%define DD2 rsp + _DIGEST + 16*7 + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movdqa %%t0, %%r0 + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movdqa %%t1, %%r2 + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movdqa %%r1, %%t0 + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movdqa %%r3, %%r0 + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pand %%F,%%X + pxor %%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pxor %%F,%%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,[rel ONES] ; pnot %%F + por %%F,%%X + pxor %%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%tmp, (32-%%imm) + pslld %%reg, %%imm + por %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + paddd %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + paddd %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + paddd 
%%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + paddd %%A, %%FUN + paddd %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) +; +align 32 +MKGLOBAL(md5_x4x2_sse,function,internal) +md5_x4x2_sse: + + sub rsp, STACK_SIZE + + ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 + ;; Initialize digests + movdqa A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE] + movdqa B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE] + movdqa C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE] + movdqa D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE] + + ;; Initialize digests + movdqa A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE] + movdqa B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE] + movdqa C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE] + movdqa D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE] + + lea TBL, [rel MD5_TABLE] + + ;; load input pointers + mov inp0,[arg1+_data_ptr_md5 +0*PTR_SZ] + mov inp1,[arg1+_data_ptr_md5 +1*PTR_SZ] + mov inp2,[arg1+_data_ptr_md5 +2*PTR_SZ] + mov inp3,[arg1+_data_ptr_md5 +3*PTR_SZ] + mov inp4,[arg1+_data_ptr_md5 +4*PTR_SZ] + mov inp5,[arg1+_data_ptr_md5 +5*PTR_SZ] + mov inp6,[arg1+_data_ptr_md5 +6*PTR_SZ] + mov inp7,[arg1+_data_ptr_md5 +7*PTR_SZ] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16],T0 + movdqa [mem1+(I*4+1)*16],T1 + movdqa [mem1+(I*4+2)*16],T2 + movdqa [mem1+(I*4+3)*16],T3 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16 + 16*16],T0 + movdqa [mem1+(I*4+1)*16 + 16*16],T1 + movdqa [mem1+(I*4+2)*16 + 16*16],T2 + movdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + movdqa [AA], A + movdqa [BB], B + movdqa [CC], C + movdqa [DD], D + ; save old digests + movdqa [AA2], A2 + movdqa [BB2], B2 + movdqa [CC2], C2 + movdqa [DD2], D2 + + add IDX, 4*16 + sub arg2, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 
5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, 
[TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + 
movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, 
[TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; write out digests + movdqu [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A + movdqu [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B + movdqu [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C + movdqu [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D + movdqu [arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 + movdqu [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 + movdqu [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 + movdqu [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, 
IDX + mov [arg1 +_data_ptr_md5 + 0*PTR_SZ], inp0 + mov [arg1 +_data_ptr_md5 + 1*PTR_SZ], inp1 + mov [arg1 +_data_ptr_md5 + 2*PTR_SZ], inp2 + mov [arg1 +_data_ptr_md5 + 3*PTR_SZ], inp3 + mov [arg1 +_data_ptr_md5 + 4*PTR_SZ], inp4 + mov [arg1 +_data_ptr_md5 + 5*PTR_SZ], inp5 + mov [arg1 +_data_ptr_md5 + 6*PTR_SZ], inp6 + mov [arg1 +_data_ptr_md5 + 7*PTR_SZ], inp7 + + ;; Clear stack frame (72*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep (2*2*16+8) + movdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/pon_sse.asm b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm new file mode 100644 index 000000000..32585f5f8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm @@ -0,0 +1,875 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "job_aes_hmac.asm" +%include "include/os.asm" +%include "include/memcpy.asm" + +;;; This is implementation of stitched algorithms: AES128-CTR + CRC32 + BIP +;;; This combination is required by PON/xPON/gPON standard. 
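+;;; For reference (derived from the code below): the macros in this file assume
+;;; an XGEM frame laid out as an 8-byte XGEM header (carrying the PLI and HEC fields),
+;;; followed by the payload to cipher, whose last 4 bytes are the Ethernet FCS.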
+;;; Note: BIP is running XOR of double words +;;; Order of operations: +;;; - encrypt: CRC32, AES-CTR and BIP +;;; - decrypt: BIP, AES-CTR and CRC32 + +%ifndef DEC_FN_NAME +%define DEC_FN_NAME submit_job_pon_dec_sse +%endif +%ifndef ENC_FN_NAME +%define ENC_FN_NAME submit_job_pon_enc_sse +%endif +%ifndef ENC_NO_CTR_FN_NAME +%define ENC_NO_CTR_FN_NAME submit_job_pon_enc_no_ctr_sse +%endif +%ifndef DEC_NO_CTR_FN_NAME +%define DEC_NO_CTR_FN_NAME submit_job_pon_dec_no_ctr_sse +%endif + +extern byteswap_const +extern ddq_add_1 + +section .data +default rel + +;;; Precomputed constants for CRC32 (Ethernet FCS) +;;; Details of the CRC algorithm and 4 byte buffer of +;;; {0x01, 0x02, 0x03, 0x04}: +;;; Result Poly Init RefIn RefOut XorOut +;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF +align 16 +rk1: + dq 0x00000000ccaa009e, 0x00000001751997d0 + +align 16 +rk5: + dq 0x00000000ccaa009e, 0x0000000163cd6124 + +align 16 +rk7: + dq 0x00000001f7011640, 0x00000001db710640 + +align 16 +pshufb_shf_table: + ;; use these values for shift registers with the pshufb instruction + dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 + dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +align 16 +init_crc_value: + dq 0x00000000FFFFFFFF, 0x0000000000000000 + +align 16 +mask: + dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 + +align 16 +mask2: + dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF +align 16 +mask3: + dq 0x8080808080808080, 0x8080808080808080 + +align 16 +mask_out_top_bytes: + dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF + dq 0x0000000000000000, 0x0000000000000000 + +;; Precomputed constants for HEC calculation (XGEM header) +;; POLY 0x53900000: +;; k1 = 0xf9800000 +;; k2 = 0xa0900000 +;; k3 = 0x7cc00000 +;; q = 0x46b927ec +;; p_res = 0x53900000 + +align 16 +k3_q: + dq 0x7cc00000, 0x46b927ec + +align 16 +p_res: + dq 0x53900000, 0 + +align 16 +mask_out_top_64bits: + dq 0xffffffff_ffffffff, 0 + +section .text + +%define NUM_AES_ROUNDS 10 + +;; note: leave xmm0 free for implicit blend +%define xcounter xmm7 +%define xbip xmm1 +%define xcrc xmm2 +%define xcrckey xmm3 +%define xtmp1 xmm4 +%define xtmp2 xmm5 +%define xtmp3 xmm6 +%define xtmp4 xmm8 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define tmp_1 r8 +%define tmp_2 r9 +%define tmp_3 r10 +%define tmp_4 r11 +%define tmp_5 r12 +%define tmp_6 r13 +%define tmp_7 r14 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define tmp_1 r10 +%define tmp_2 r11 +%define tmp_3 rax +%define tmp_4 r12 +%define tmp_5 r13 +%define tmp_6 r14 +%define tmp_7 r15 +%endif + +%define job arg1 + +%define p_in arg2 +%define p_keys arg3 +%define p_out arg4 + +%define num_bytes tmp_1 ; bytes to cipher +%define tmp tmp_2 +%define ctr_check tmp_3 ; counter block overflow check +%define bytes_to_crc tmp_4 ; number of bytes to CRC ( < num_bytes) + +%define ethernet_fcs tmp_6 ; not used together with tmp3 +%define tmp2 tmp_5 +%define tmp3 tmp_6 + +%define write_back_crc tmp_7 +%define decrypt_not_done tmp_7 + +;;; ============================================================================ +;;; Does all AES encryption rounds +%macro AES_ENC_ROUNDS 3 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14) +%define %%BLOCK %3 ; [in/out] XMM with encrypted block + +%assign round 0 + pxor %%BLOCK, [%%KP + (round * 16)] + +%rep (%%N_ROUNDS - 1) +%assign round (round + 1) + aesenc %%BLOCK, [%%KP + (round * 16)] +%endrep + +%assign round (round + 1) + aesenclast %%BLOCK, [%%KP + (round * 
16)] + +%endmacro + +;;; ============================================================================ +;;; PON stitched algorithm round on a single AES block (16 bytes): +;;; AES-CTR (optional, depending on %%CIPH) +;;; - prepares counter blocks +;;; - encrypts counter blocks +;;; - loads text +;;; - xor's text against encrypted blocks +;;; - stores cipher text +;;; BIP +;;; - BIP update on 4 x 32-bits +;;; CRC32 +;;; - CRC32 calculation +;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro +;;; behaviour can be achieved to match needs of the overall algorithm. +%macro DO_PON 15 +%define %%KP %1 ; [in] GP, pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14) +%define %%CTR %3 ; [in/out] XMM with counter block +%define %%INP %4 ; [in/out] GP with input text pointer or "no_load" +%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store" +%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip" +%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below) +%define %%XCRC_MUL %8 ; [in] XMM with CRC multiplier constant (can be anything if "no_crc" below) +%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store) +%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load) +%define %%TXMM2 %11 ; [clobbered] XMM temporary +%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc" +%define %%DIR %13 ; [in] "ENC" or "DEC" +%define %%CIPH %14 ; [in] "CTR" or "NO_CTR" +%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow) + +%ifidn %%CIPH, CTR + ;; prepare counter blocks for encryption + movdqa %%TXMM0, %%CTR + pshufb %%TXMM0, [rel byteswap_const] + ;; perform 1 increment on whole 128 bits + movdqa %%TXMM2, [rel ddq_add_1] + paddq %%CTR, %%TXMM2 + add %%CTR_CHECK, 1 + jnc %%_no_ctr_overflow + ;; Add 1 to the top 64 bits. First shift left value 1 by 64 bits. 
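+ ;; Clarifying note: %%TXMM2 still holds ddq_add_1 (assumed to be the constant 1
+ ;; in the low quadword); shifting it left by 8 bytes moves that 1 into the high
+ ;; quadword, so the paddq below carries the increment into the top 64 bits of %%CTR.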
+ pslldq %%TXMM2, 8 + paddq %%CTR, %%TXMM2 +%%_no_ctr_overflow: +%endif + ;; CRC calculation +%ifidn %%CRC_TYPE, next_crc + movdqa %%TXMM2, %%XCRC_IN_OUT + pclmulqdq %%TXMM2, %%XCRC_MUL, 0x01 + pclmulqdq %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 +%endif + +%ifnidn %%INP, no_load + movdqu %%TXMM1, [%%INP] +%endif + +%ifidn %%CIPH, CTR + ;; AES rounds + AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0 + + ;; xor plaintext/ciphertext against encrypted counter blocks + pxor %%TXMM0, %%TXMM1 +%else ;; CIPH = NO_CTR + ;; if no encryption needs to be done, move from input to output reg + movdqa %%TXMM0, %%TXMM1 +%endif ;; CIPH = CTR + +%ifidn %%CIPH, CTR +%ifidn %%DIR, ENC + ;; CRC calculation for ENCRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%else + ;; CRC calculation for DECRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM0 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM0 +%endif +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + ;; CRC calculation for DECRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif + +%endif ;; CIPH = CTR + + ;; store the result in the output buffer +%ifnidn %%OUTP, no_store + movdqu [%%OUTP], %%TXMM0 +%endif + + ;; update BIP value - always use cipher text for BIP +%ifidn %%DIR, ENC +%ifnidn %%XBIP_IN_OUT, no_bip + pxor %%XBIP_IN_OUT, %%TXMM0 +%endif +%else +%ifnidn %%XBIP_IN_OUT, no_bip + pxor %%XBIP_IN_OUT, %%TXMM1 +%endif +%endif ; DECRYPT + + ;; increment in/out pointers +%ifnidn %%INP, no_load + add %%INP, 16 +%endif +%ifnidn %%OUTP, no_store + add %%OUTP, 16 +%endif +%endmacro ; DO_PON + +;;; ============================================================================ +;;; CIPHER and BIP specified number of bytes +%macro CIPHER_BIP_REST 14 +%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher +%define %%DIR %2 ; [in] "ENC" or "DEC" +%define %%CIPH %3 ; [in] "CTR" or "NO_CTR" +%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer +%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer +%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys +%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state +%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block +%define %%XMMT1 %9 ; [clobbered] temporary XMM +%define %%XMMT2 %10 ; [clobbered] temporary XMM +%define %%XMMT3 %11 ; [clobbered] temporary XMM +%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow) +%define %%GPT1 %13 ; [clobbered] temporary GP +%define %%GPT2 %14 ; [clobbered] temporary GP + +%%_cipher_last_blocks: + cmp %%NUM_BYTES, 16 + jb %%_partial_block_left + + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + sub %%NUM_BYTES, 16 + jz %%_bip_done + jmp %%_cipher_last_blocks + +%%_partial_block_left: + simd_load_sse_15_1 
%%XMMT2, %%PTR_IN, %%NUM_BYTES + + ;; DO_PON() is not loading nor storing the data in this case: + ;; XMMT2 = data in + ;; XMMT1 = data out + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + + ;; BIP update for partial block (mask out bytes outside the message) + lea %%GPT1, [rel mask_out_top_bytes + 16] + sub %%GPT1, %%NUM_BYTES + movdqu %%XMMT3, [%%GPT1] + ;; put masked cipher text into XMMT2 for BIP update +%ifidn %%DIR, ENC + movdqa %%XMMT2, %%XMMT1 + pand %%XMMT2, %%XMMT3 +%else + pand %%XMMT2, %%XMMT3 +%endif + pxor %%XBIP_IN_OUT, %%XMMT2 + + ;; store partial bytes in the output buffer + simd_store_sse_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2 + +%%_bip_done: +%endmacro ; CIPHER_BIP_REST +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial + +%macro CRC32_REDUCE_128_TO_32 5 +%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value +%define %%XCRC %2 ; [in/clobbered] XMM with CRC +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register + +%define %%XCRCKEY %%XT3 + + ;; compute CRC of a 128-bit value + movdqa %%XCRCKEY, [rel rk5] + + ;; 64b fold + movdqa %%XT1, %%XCRC + pclmulqdq %%XT1, %%XCRCKEY, 0x00 + psrldq %%XCRC, 8 + pxor %%XCRC, %%XT1 + + ;; 32b fold + movdqa %%XT1, %%XCRC + pslldq %%XT1, 4 + pclmulqdq %%XT1, %%XCRCKEY, 0x10 + pxor %%XCRC, %%XT1 + +%%_crc_barrett: + ;; Barrett reduction + pand %%XCRC, [rel mask2] + movdqa %%XT1, %%XCRC + movdqa %%XT2, %%XCRC + movdqa %%XCRCKEY, [rel rk7] + + pclmulqdq %%XCRC, %%XCRCKEY, 0x00 + pxor %%XCRC, %%XT2 + pand %%XCRC, [rel mask] + movdqa %%XT2, %%XCRC + pclmulqdq %%XCRC, %%XCRCKEY, 0x10 + pxor %%XCRC, %%XT2 + pxor %%XCRC, %%XT1 + pextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value + not DWORD(%%CRC) +%endmacro + +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_128_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out +%define %%XT1 %2 ; [clobbered] temporary xmm register +%define %%XT2 %3 ; [clobbered] temporary xmm register +%define %%XT3 %4 ; [clobbered] temporary xmm register + +%define %%K3_Q %%XT1 +%define %%P_RES %%XT2 +%define %%XTMP %%XT3 + + ;; 128 to 64 bit reduction + movdqa %%K3_Q, [k3_q] + movdqa %%P_RES, [p_res] + + movdqa %%XTMP, %%XMM_IN_OUT + pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3 + pxor %%XTMP, %%XMM_IN_OUT + + pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3 + pxor %%XMM_IN_OUT, %%XTMP + + pand %%XMM_IN_OUT, [rel mask_out_top_64bits] + + ;; 64 to 32 bit reduction + movdqa %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q + pxor %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + + pclmulqdq %%XTMP, %%P_RES, 0x00 ; P + pxor %%XMM_IN_OUT, %%XTMP +%endmacro + +;; ============================================================================= +;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_64_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out +%define %%XT1 %2 ; [clobbered] temporary xmm register +%define %%XT2 %3 ; [clobbered] temporary xmm register +%define %%XT3 %4 ; [clobbered] temporary xmm register + +%define %%K3_Q %%XT1 +%define %%P_RES %%XT2 +%define 
%%XTMP %%XT3 + + movdqa %%K3_Q, [k3_q] + movdqa %%P_RES, [p_res] + + ;; 64 to 32 bit reduction + movdqa %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q + pxor %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + + pclmulqdq %%XTMP, %%P_RES, 0x00 ; P + pxor %%XMM_IN_OUT, %%XTMP +%endmacro + +;; ============================================================================= +;; HEC compute and header update for 32-bit XGEM headers +%macro HEC_COMPUTE_32 6 +%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format +%define %%GT1 %2 ; [clobbered] temporary GP register +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register +%define %%XT4 %6 ; [clobbered] temporary xmm register + + mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT) + ;; shift out 13 bits of HEC value for CRC computation + shr DWORD(%%GT1), 13 + + ;; mask out current HEC value to merge with an updated HEC at the end + and DWORD(%%HEC_IN_OUT), 0xffff_e000 + + ;; prepare the message for CRC computation + movd %%XT1, DWORD(%%GT1) + pslldq %%XT1, 4 ; shift left by 32-bits + + HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4 + + ;; extract 32-bit value + ;; - normally perform 20 bit shift right but bit 0 is a parity bit + movd DWORD(%%GT1), %%XT1 + shr DWORD(%%GT1), (20 - 1) + + ;; merge header bytes with updated 12-bit CRC value and + ;; compute parity + or DWORD(%%GT1), DWORD(%%HEC_IN_OUT) + popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1) + and DWORD(%%HEC_IN_OUT), 1 + or DWORD(%%HEC_IN_OUT), DWORD(%%GT1) +%endmacro + +;; ============================================================================= +;; HEC compute and header update for 64-bit XGEM headers +%macro HEC_COMPUTE_64 6 +%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format +%define %%GT1 %2 ; [clobbered] temporary GP register +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register +%define %%XT4 %6 ; [clobbered] temporary xmm register + + mov %%GT1, %%HEC_IN_OUT + ;; shift out 13 bits of HEC value for CRC computation + shr %%GT1, 13 + + ;; mask out current HEC value to merge with an updated HEC at the end + and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000 + + ;; prepare the message for CRC computation + movq %%XT1, %%GT1 + pslldq %%XT1, 4 ; shift left by 32-bits + + HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4 + + ;; extract 32-bit value + ;; - normally perform 20 bit shift right but bit 0 is a parity bit + movd DWORD(%%GT1), %%XT1 + shr DWORD(%%GT1), (20 - 1) + + ;; merge header bytes with updated 12-bit CRC value and + ;; compute parity + or %%GT1, %%HEC_IN_OUT + popcnt %%HEC_IN_OUT, %%GT1 + and %%HEC_IN_OUT, 1 + or %%HEC_IN_OUT, %%GT1 +%endmacro + +;;; ============================================================================ +;;; PON stitched algorithm of AES128-CTR, CRC and BIP +;;; - this is master macro that implements encrypt/decrypt API +;;; - calls other macros and directly uses registers +;;; defined at the top of the file +%macro AES128_CTR_PON 2 +%define %%DIR %1 ; [in] direction "ENC" or "DEC" +%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR" + + push r12 + push r13 + push r14 +%ifndef LINUX + push r15 +%endif + +%ifidn %%DIR, ENC + ;; by default write back CRC for encryption + mov DWORD(write_back_crc), 1 +%else + ;; mark decryption as finished + mov DWORD(decrypt_not_done), 1 +%endif + ;; START BIP (and update HEC if encrypt
direction) + ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload) + ;; - convert it into LE + ;; - update HEC field in the header + ;; - convert it into BE + ;; - store back the header (with updated HEC) + ;; - start BIP + ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage) + mov tmp_2, [job + _src] + add tmp_2, [job + _hash_start_src_offset_in_bytes] + mov tmp_3, [tmp_2] +%ifidn %%DIR, ENC + bswap tmp_3 ; go to LE + HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4 + mov bytes_to_crc, tmp_3 + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits + bswap tmp_3 ; go back to BE + mov [tmp_2], tmp_3 + movq xbip, tmp_3 +%else + movq xbip, tmp_3 + mov bytes_to_crc, tmp_3 + bswap bytes_to_crc ; go to LE + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits +%endif + cmp bytes_to_crc, 4 + ja %%_crc_not_zero + ;; XGEM payload shorter or equal to 4 bytes +%ifidn %%DIR, ENC + ;; Don't write Ethernet FCS on encryption + xor DWORD(write_back_crc), DWORD(write_back_crc) +%else + ;; Mark decryption as not finished + ;; - Ethernet FCS is not computed + ;; - decrypt + BIP to be done at the end + xor DWORD(decrypt_not_done), DWORD(decrypt_not_done) +%endif + mov DWORD(bytes_to_crc), 4 ; it will be zero after the sub (avoid jmp) +%%_crc_not_zero: + sub bytes_to_crc, 4 ; subtract size of the CRC itself + +%ifidn %%CIPH, CTR + ;; - read 16 bytes of IV + ;; - convert to little endian format + ;; - save least significant 8 bytes in GP register for overflow check + mov tmp, [job + _iv] + movdqu xcounter, [tmp] + pshufb xcounter, [rel byteswap_const] + movq ctr_check, xcounter +%endif + + ;; get input buffer (after XGEM header) + mov p_in, [job + _src] + add p_in, [job + _cipher_start_src_offset_in_bytes] + + ;; get output buffer + mov p_out, [job + _dst] + +%ifidn %%CIPH, CTR + ;; get key pointers + mov p_keys, [job + _aes_enc_key_expanded] +%endif + + ;; initial CRC value + movdqa xcrc, [rel init_crc_value] + + ;; load CRC constants + movdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey + + ;; get number of bytes to cipher +%ifidn %%CIPH, CTR + mov num_bytes, [job + _msg_len_to_cipher_in_bytes] +%else + ;; Message length to cipher is 0 + ;; - length is obtained from message length to hash (BIP) minus XGEM header size + mov num_bytes, [job + _msg_len_to_hash_in_bytes] + sub num_bytes, 8 +%endif + or bytes_to_crc, bytes_to_crc + jz %%_crc_done + + cmp bytes_to_crc, 32 + jae %%_at_least_32_bytes + +%ifidn %%DIR, DEC + ;; decrypt the buffer first + mov tmp, num_bytes + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + ;; correct in/out pointers - go back to start of the buffers + mov tmp, num_bytes + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp +%endif ; DECRYPTION + + ;; less than 32 bytes + cmp bytes_to_crc, 16 + je %%_exact_16_left + jl %%_less_than_16_left + ;; load the plaintext +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in] +%else + movdqu xtmp1, [p_out] +%endif + pxor xcrc, xtmp1 ; xor the initial crc value + jmp %%_crc_two_xmms + +%%_exact_16_left: +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in] +%else + movdqu xtmp1, [p_out] +%endif + pxor xcrc, xtmp1 ; xor the initial CRC value + jmp %%_128_done + +%%_less_than_16_left: +%ifidn %%DIR, ENC + simd_load_sse_15_1 xtmp1, p_in, bytes_to_crc +%else + simd_load_sse_15_1 xtmp1, p_out, bytes_to_crc +%endif + pxor xcrc, xtmp1 ; xor the initial CRC value + + lea tmp, [rel pshufb_shf_table] + movdqu xtmp1, [tmp + bytes_to_crc] + pshufb xcrc, 
xtmp1 + jmp %%_128_done + +%%_at_least_32_bytes: + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 + +%%_main_loop: + cmp bytes_to_crc, 16 + jb %%_exit_loop + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 +%ifidn %%DIR, ENC + jz %%_128_done +%endif + jmp %%_main_loop + +%%_exit_loop: + +%ifidn %%DIR, DEC + ;; decrypt rest of the message including CRC and optional padding + mov tmp, num_bytes + + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp + + or bytes_to_crc, bytes_to_crc + jz %%_128_done +%endif ; DECRYPTION + + ;; Partial bytes left - complete CRC calculation +%%_crc_two_xmms: + lea tmp, [rel pshufb_shf_table] + movdqu xtmp2, [tmp + bytes_to_crc] +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%else + movdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%endif + movdqa xtmp3, xcrc + pshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc + pxor xtmp2, [rel mask3] + pshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc + + ;; data num_bytes (top) blended with MSB bytes of CRC (bottom) + movdqa xmm0, xtmp2 + pblendvb xtmp3, xtmp1 ; xmm0 implicit + + ;; final CRC calculation + movdqa xtmp1, xcrc + pclmulqdq xtmp1, xcrckey, 0x01 + pclmulqdq xcrc, xcrckey, 0x10 + pxor xcrc, xtmp3 + pxor xcrc, xtmp1 + +%%_128_done: + CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey + +%%_crc_done: + ;; @todo - store-to-load problem in ENC case (to be fixed later) + ;; - store CRC in input buffer and authentication tag output + ;; - encrypt remaining bytes +%ifidn %%DIR, ENC + or DWORD(write_back_crc), DWORD(write_back_crc) + jz %%_skip_crc_write_back + mov [p_in + bytes_to_crc], DWORD(ethernet_fcs) +%%_skip_crc_write_back: +%endif + mov tmp, [job + _auth_tag_output] + mov [tmp + 4], DWORD(ethernet_fcs) + + or num_bytes, num_bytes + jz %%_do_not_cipher_the_rest + + ;; encrypt rest of the message + ;; - partial bytes including CRC and optional padding + ;; decrypt rest of the message + ;; - this may only happen when XGEM payload is short and padding is added +%ifidn %%DIR, DEC + or DWORD(decrypt_not_done), DWORD(decrypt_not_done) + jnz %%_do_not_cipher_the_rest +%endif + CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 +%%_do_not_cipher_the_rest: + + ;; finalize BIP + movdqa xtmp1, xbip + movdqa xtmp2, xbip + movdqa xtmp3, xbip + psrldq xtmp1, 4 + psrldq xtmp2, 8 + psrldq xtmp3, 12 + pxor xtmp1, xtmp2 + pxor xbip, xtmp3 + pxor xbip, xtmp1 + movd [tmp], xbip + + ;; set job status + or dword [job + _status], STS_COMPLETED + + ;; return job + mov rax, job + +%ifndef LINUX + pop r15 +%endif + pop r14 + pop r13 + pop r12 +%endmacro ; AES128_CTR_PON + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; aes_cntr_128_pon_enc_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(ENC_FN_NAME,function,internal) 
+ENC_FN_NAME: + AES128_CTR_PON ENC, CTR + ret + +;;; aes_cntr_128_pon_dec_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(DEC_FN_NAME,function,internal) +DEC_FN_NAME: + AES128_CTR_PON DEC, CTR + ret + +;;; aes_cntr_128_pon_enc_no_ctr_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(ENC_NO_CTR_FN_NAME,function,internal) +ENC_NO_CTR_FN_NAME: + AES128_CTR_PON ENC, NO_CTR + ret + +;;; aes_cntr_128_pon_dec_no_ctr_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(DEC_NO_CTR_FN_NAME,function,internal) +DEC_NO_CTR_FN_NAME: + AES128_CTR_PON DEC, NO_CTR + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm new file mode 100644 index 000000000..355a38906 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm @@ -0,0 +1,435 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +;; code to compute quad SHA1 using SSE +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact +;; rbx, rsi, rdi, rbp, r12-r15 left intact +;; This version is not safe to call from C/C++ + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 
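+;; One SHA-1 round for rounds 0-15, computed on four lanes at once:
+;;   E += W[i] + K + ROL(A,5) + F(B,C,D);  B = ROL(B,30)
+;; W[i] for all four lanes is read from the transposed message block kept on
+;; the stack at [rsp + memW*16]; the caller rotates the register names with
+;; ROTATE_ARGS after each step.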
+%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + paddd %%regE,[rsp + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + movdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [rsp + ((%%memW - 8) & 15) * 16] + pxor W16, [rsp + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ must be an odd multiple of 8 +%define FRAMESZ 16*16 + 8 + +%define MOVPS movdqu + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +align 32 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks); +; arg 1 : rcx : pointer to args +; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 +MKGLOBAL(sha1_mult_sse,function,internal) +sha1_mult_sse: + + sub rsp, FRAMESZ + + ;; Initialize digests + movdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] + movdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] + movdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] + movdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] + movdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] + DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E + ;; load input pointers + mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] + mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] + mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] + mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] + DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3 + xor IDX, IDX +lloop: + movdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + MOVPS T2,[inp0+IDX] + MOVPS T1,[inp1+IDX] + MOVPS T4,[inp2+IDX] + MOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3 + pshufb T0, F + movdqa [rsp+(I*4+0)*16],T0 + pshufb T1, F + movdqa [rsp+(I*4+1)*16],T1 + pshufb T2, F + movdqa [rsp+(I*4+2)*16],T2 + pshufb T3, F + movdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E + +;; +;; perform 0-79 steps +;; + movdqa K, [rel K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + movdqa W16, [rsp + ((16 - 16) & 15) * 16] + movdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + movdqa K, [rel K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + movdqa K, [rel K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + movdqa K, [rel K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + paddd A,AA + paddd B,BB + paddd C,CC + paddd D,DD + paddd E,EE + + sub arg2, 1 + jne lloop + + ; write out digests + movdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A + movdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B + movdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C + movdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D + movdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E + DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 + DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3 + ;;;;;;;;;;;;;;;; + ;; Postamble + + ;; Clear stack frame (16*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep 16 + movdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, FRAMESZ + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm new file mode 100644 index 000000000..c02c88eed --- /dev/null +++ 
b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm @@ -0,0 +1,493 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Stack must be aligned to 32 bytes before call +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RDX R10 R11 +;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RDI R10 R11 +;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 + +%include "include/os.asm" +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +; reso = resdq => 16 bytes +struc frame +.ABCD_SAVE reso 1 +.E_SAVE reso 1 +.ABCD_SAVEb reso 1 +.E_SAVEb reso 1 +.align resq 1 +endstruc + +%define INP r10 +%define INPb r11 + +%define ABCD xmm0 +%define E0 xmm1 ; Need two E's b/c they ping pong +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 + +%define ABCDb xmm7 +%define E0b xmm8 ; Need two E's b/c they ping pong +%define E1b xmm9 +%define MSG0b xmm10 +%define MSG1b xmm11 +%define MSG2b xmm12 +%define MSG3b xmm13 + +%define SHUF_MASK xmm14 +%define E_MASK xmm15 + +section .data +default rel +align 64 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f + dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000 + dq 0x0000000000000000, 0xFFFFFFFF00000000 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_ni(SHA1_ARGS *args, UINT32 
size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed to be >= 1 + +section .text +MKGLOBAL(sha1_ni,function,internal) +align 32 +sha1_ni: + sub rsp, frame_size + + DBGPRINTL "enter sha1-ni-x2" + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + + ;; load input pointers + mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ] + DBGPRINTL64 "jobA: pointer", INP + mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ] + + add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition + + ;; load initial digest + movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE] + pxor E0, E0 + pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCD, ABCD, 0x1B + + DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest in word 4", E0 + + movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE] + pxor E0b, E0b + pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCDb, ABCDb, 0x1B + + movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + movdqa E_MASK, [rel UPPER_WORD_MASK] + + DBGPRINTL "jobA data:" +loop0: + ;; Copy digests + movdqa [rsp + frame.ABCD_SAVE], ABCD + movdqa [rsp + frame.E_SAVE], E0 + movdqa [rsp + frame.ABCD_SAVEb], ABCDb + movdqa [rsp + frame.E_SAVEb], E0b + + ;; Only needed if not using sha1nexte for rounds 0-3 + pand E0, E_MASK + pand E0b, E_MASK + + ;; Needed if using sha1nexte for rounds 0-3 + ;; Need to rotate E right by 30 + ;movdqa E1, E0 + ;psrld E0, 30 + ;pslld E1, 2 + ;pxor E0, E1 + + ;; Rounds 0-3 + movdqu MSG0, [INP + 0*16] + pshufb MSG0, SHUF_MASK + DBGPRINT_XMM MSG0 + ;sha1nexte E0, MSG0 + paddd E0, MSG0 ; instead of sha1nexte + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + movdqu MSG0b, [INPb + 0*16] + pshufb MSG0b, SHUF_MASK + ;sha1nexte E0b, MSG0b + paddd E0b, MSG0b ; instead of sha1nexte + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + + ;; Rounds 4-7 + movdqu MSG1, [INP + 1*16] + pshufb MSG1, SHUF_MASK + DBGPRINT_XMM MSG1 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + movdqu MSG1b, [INPb + 1*16] + pshufb MSG1b, SHUF_MASK + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG0b, MSG1b + + ;; Rounds 8-11 + movdqu MSG2, [INP + 2*16] + pshufb MSG2, SHUF_MASK + DBGPRINT_XMM MSG2 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + movdqu MSG2b, [INPb + 2*16] + pshufb MSG2b, SHUF_MASK + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 12-15 + movdqu MSG3, [INP + 3*16] + pshufb MSG3, SHUF_MASK + DBGPRINT_XMM MSG3 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + movdqu MSG3b, [INPb + 3*16] + pshufb MSG3b, SHUF_MASK + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + + ;; Rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 24-27 + 
sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 3 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 3 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 3 + pxor MSG3b, MSG1b + + ;; Rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 3 + + ;; Rounds 76-79 + sha1nexte E1, MSG3 
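+;; (the sha1rnds4 immediate selects the 20-round group: 0 for rounds 0-19,
+;; 1 for 20-39, 2 for 40-59, 3 for 60-79 - hence the value 3 used for these
+;; final rounds)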
+ movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 3 + + ;; Need to rotate E left by 30 + movdqa E1, E0 + pslld E0, 30 + psrld E1, 2 + pxor E0, E1 + movdqa E1b, E0b + pslld E0b, 30 + psrld E1b, 2 + pxor E0b, E1b + + paddd ABCD, [rsp + frame.ABCD_SAVE] + paddd E0, [rsp + frame.E_SAVE] + paddd ABCDb, [rsp + frame.ABCD_SAVEb] + paddd E0b, [rsp + frame.E_SAVEb] + + add INP, 64 + add INPb, 64 + cmp INP, NUM_BLKS + jne loop0 + + ;; write out digests + pshufd ABCD, ABCD, 0x1B + movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD + pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3 + DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest out word 4", E0 + + pshufd ABCDb, ABCDb, 0x1B + movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb + pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3 + + ;; update input pointers + mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb + +done_hash: + + ;; Clear stack frame (4*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep 4 + movdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, frame_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm new file mode 100644 index 000000000..9039660cc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm @@ -0,0 +1,512 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; SHA1 code, hybrid, rolled, interleaved +; Uses SSE instructions +%include "include/os.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +%define MOVDQ movdqu ;; assume buffers not aligned + +%ifdef LINUX +%define INP rdi ; 1st arg +%define CTX rsi ; 2nd arg +%define REG3 edx +%define REG4 ecx +%else +%define INP rcx ; 1st arg +%define CTX rdx ; 2nd arg +%define REG3 edi +%define REG4 esi +%endif + +%define FRAMESZ 3*16 + 1*8 +%define _RSP FRAMESZ-1*8 + rsp + +%define a eax +%define b ebx +%define c REG3 +%define d REG4 +%define e r8d +%define T1 r9d +%define f r10d +%define RND r11d +%define g r12d +%define h r13d + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XK xmm2 + +%xdefine X0 xmm3 +%xdefine X1 xmm4 +%xdefine X2 xmm5 +%xdefine X3 xmm6 +%xdefine X4 xmm7 + +%define XFER xmm8 + +%define SZ 4 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X_ +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regC + xor %%regF,%%regD + and %%regF,%%regB + xor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regD + xor %%regF,%%regC + xor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regB + mov %%regT,%%regB + or %%regF,%%regC + and %%regT,%%regC + and %%regF,%%regD + or %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +;; input is T1 +%macro ROUND 1 +%define %%MAGIC %1 + add e,T1 + mov T1,a + rol T1,5 + add e,T1 + %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + rol b,30 + add h,e +ROTATE_ARGS +%endmacro + +%macro do_4i 1 + movdqa XFER, XK + paddd XFER, X0 + pextrd T1, XFER, 0 + ;ROUND %1 + add e,T1 + ;SCHEDULE_4 + movdqa XTMP0, X1 + palignr XTMP0, X0, 8 ; XTMP0 = W[-14] + mov T1,a + movdqa XTMP1, X2 + rol T1,5 + pxor XTMP1, X0 ; XTMP1 = W[-8] ^ W[-16] + add e,T1 + pxor XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + + ;; Finish low half + movdqa X4, X3 + rol b,30 + psrldq X4, 4 ; X4 = W[-3] {xxBA} + add h,e +ROTATE_ARGS + pextrd T1, XFER, 1 + ;ROUND %1 + add e,T1 + pxor X4, XTMP0 ; + mov T1,a + movdqa XTMP1, X4 + rol T1,5 + ;; rotate X4 left 1 + psrld XTMP1, (32-1) + add e,T1 + pslld 
X4, 1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + pxor X4, XTMP1 ; X4 = W[0] {xxBA} + rol b,30 + add h,e +ROTATE_ARGS + pextrd T1, XFER, 2 + ;ROUND %1 + add e,T1 + movdqa XTMP1, X4 + mov T1,a + + ;; Finish high half + palignr XTMP1, X3, 4 ; XTMP1 = w[-3] {DCxx} + rol T1,5 + add e,T1 + pxor XTMP0, XTMP1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; rotate XTMP0 left 1 + movdqa XTMP1, XTMP0 + psrld XTMP1, (32-1) + rol b,30 + add h,e +ROTATE_ARGS + pextrd T1, XFER, 3 + ;ROUND %1 + add e,T1 + mov T1,a + pslld XTMP0, 1 + rol T1,5 + add e,T1 + pxor XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; COMBINE HALVES + shufps X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} + rol b,30 + add h,e + + rotate_Xs +ROTATE_ARGS +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_block_sse(void *input_data, UINT32 digest[5]) +;; arg 1 : (in) pointer to one block of data +;; arg 2 : (in/out) pointer to read/write digest +MKGLOBAL(sha1_block_sse,function,internal) +align 32 +sha1_block_sse: + push rbx + push rsi + push rdi + push r12 + push r13 + + movdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] + +%ifndef LINUX + mov rax, rsp ; copy rsp + sub rsp, FRAMESZ + and rsp, -16 ; align stack frame + mov [_RSP],rax ; save copy of rsp + movdqa [rsp + 0 * 16], xmm6 + movdqa [rsp + 1 * 16], xmm7 + movdqa [rsp + 2 * 16], xmm8 + +%endif + MOVDQ X0, [INP + 0*16] + MOVDQ X1, [INP + 1*16] + + ;; load next message block + MOVDQ X2, [INP + 2*16] + MOVDQ X3, [INP + 3*16] + + ;; set up a-f based on h0-h4 + ;; byte swap first 16 dwords + mov a, [SZ*0 + CTX] + pshufb X0, XTMP0 + mov b, [SZ*1 + CTX] + pshufb X1, XTMP0 + mov c, [SZ*2 + CTX] + pshufb X2, XTMP0 + mov d, [SZ*3 + CTX] + pshufb X3, XTMP0 + mov e, [SZ*4 + CTX] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K00_19] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop1_5 +align 16 +loop1: + + do_4i MAGIC_F0 + +loop1_5: + do_4i MAGIC_F0 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K20_39] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop2_5 +align 16 +loop2: + + do_4i MAGIC_F1 + +loop2_5: + do_4i MAGIC_F1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K40_59] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + 
ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop3_5 +align 16 +loop3: + + do_4i MAGIC_F2 + +loop3_5: + do_4i MAGIC_F2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop3 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 60-79 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K60_79] + + do_4i MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X0 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X1 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X2 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X3 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + ;; update result digest h0-h4 + add [SZ*0 + CTX], a + add [SZ*1 + CTX], b + add [SZ*2 + CTX], c + add [SZ*3 + CTX], d + add [SZ*4 + CTX], e + +%ifndef LINUX + movdqa xmm8, [rsp + 2 * 16] + movdqa xmm7, [rsp + 1 * 16] + movdqa xmm6, [rsp + 0 * 16] + +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + pxor xmm0, xmm0 + movdqa [rsp + 0 * 16], xmm0 + movdqa [rsp + 1 * 16], xmm0 + movdqa [rsp + 2 * 16], xmm0 +%endif + + mov rsp, [_RSP] +%endif ;; LINUX + + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm new file mode 100644 index 000000000..f0914d799 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm @@ -0,0 +1,33 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define FUNC sha224_block_sse + +%include "sse/sha256_one_block_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm new file mode 100644 index 000000000..fa593defa --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm @@ -0,0 +1,614 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; Stack must be aligned to 32 bytes before call +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RCX RDX RSI RDI R11 +;; Windows preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RCX RDX RSI RDI R11 +;; Linux preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 + +%include "include/os.asm" +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +; resdq = res0 => 16 bytes +struc frame +.ABEF_SAVE reso 1 +.CDGH_SAVE reso 1 +.ABEF_SAVEb reso 1 +.CDGH_SAVEb reso 1 +.align resq 1 +endstruc + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +%define INP arg3 +%define INPb arg4 + + +%define SHA256CONSTANTS r11 + +;; MSG MUST be xmm0 (implicit argument) +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define STATE0b xmm8 +%define STATE1b xmm9 +%define MSGTMP0b xmm10 +%define MSGTMP1b xmm11 +%define MSGTMP2b xmm12 +%define MSGTMP3b xmm13 +%define MSGTMP xmm14 + +%define SHUF_MASK xmm15 + +section .data +default rel +align 64 +K256: + dd 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + dd 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + dd 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + dd 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + dd 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + dd 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + dd 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + dd 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + dd 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + dd 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + dd 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + dd 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + dd 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_ni(SHA256_ARGS *args, UINT32 size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed to be >= 1 +section .text +MKGLOBAL(sha256_ni,function,internal) +align 32 +sha256_ni: + sub rsp, frame_size + + DBGPRINTL "enter sha256-ni-x2" + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + + DBGPRINTL64 "jobA/B byte size:", NUM_BLKS + + ;; load input pointers + mov INP, [args + _data_ptr_sha256 + 0*PTR_SZ] + mov INPb, [args + _data_ptr_sha256 + 1*PTR_SZ] + + add NUM_BLKS, INP ; pointer to end of data + + ;; load initial digest + ;; Probably need to reorder these appropriately + ;; DCBA, HGFE -> ABEF, CDGH + + movdqu STATE0, [args + 0*SHA256NI_DIGEST_ROW_SIZE] + movdqu STATE1, [args + 0*SHA256NI_DIGEST_ROW_SIZE + 16] + movdqu 
STATE0b, [args + 1*SHA256NI_DIGEST_ROW_SIZE] + movdqu STATE1b, [args + 1*SHA256NI_DIGEST_ROW_SIZE + 16] + DBGPRINTL "jobA digest in:" + DBGPRINT_XMM STATE0 + DBGPRINT_XMM STATE1 + DBGPRINTL "jobB digest in:" + DBGPRINT_XMM STATE0b + DBGPRINT_XMM STATE1b + + pshufd STATE0, STATE0, 0xB1 ; CDAB + pshufd STATE1, STATE1, 0x1B ; EFGH + movdqa MSGTMP4, STATE0 + pshufd STATE0b, STATE0b, 0xB1 ; CDAB + pshufd STATE1b, STATE1b, 0x1B ; EFGH + movdqa MSGTMP, STATE0b + palignr STATE0, STATE1, 8 ; ABEF + palignr STATE0b, STATE1b, 8 ; ABEF + pblendw STATE1, MSGTMP4, 0xF0 ; CDGH + pblendw STATE1b, MSGTMP, 0xF0 ; CDGH + + lea SHA256CONSTANTS,[rel K256] + movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + +%ifdef DO_DBGPRINT + ;; prin buffer A + push r10 + push NUM_BLKS + DBGPRINTL "jobA data:" + xor r10, r10 + sub NUM_BLKS, INP +.loop_dbgA: + movdqu MSG, [INP + r10 + 0*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 1*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 2*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 3*16] + DBGPRINT_XMM MSG + add r10, 64 + cmp NUM_BLKS, r10 + jne .loop_dbgA + pop NUM_BLKS + pop r10 +%endif + +%ifdef DO_DBGPRINT + ;; prin buffer B + push r10 + push NUM_BLKS + DBGPRINTL "jobB data:" + xor r10, r10 + sub NUM_BLKS, INP +.loop_dbgB: + movdqu MSG, [INPb + r10 + 0*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 1*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 2*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 3*16] + DBGPRINT_XMM MSG + add r10, 64 + cmp NUM_BLKS, r10 + jne .loop_dbgB + pop NUM_BLKS + pop r10 +%endif + +.loop0: + ;; Save digests + movdqa [rsp + frame.ABEF_SAVE], STATE0 + movdqa [rsp + frame.CDGH_SAVE], STATE1 + movdqa [rsp + frame.ABEF_SAVEb], STATE0b + movdqa [rsp + frame.CDGH_SAVEb], STATE1b + + ;; Rounds 0-3 + movdqu MSG, [INP + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [SHA256CONSTANTS + 0*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0b, MSG + paddd MSG, [SHA256CONSTANTS + 0*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 4-7 + movdqu MSG, [INP + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [SHA256CONSTANTS + 1*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1b, MSG + paddd MSG, [SHA256CONSTANTS + 1*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 8-11 + movdqu MSG, [INP + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [SHA256CONSTANTS + 2*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2b, MSG + paddd MSG, [SHA256CONSTANTS + 2*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 12-15 + movdqu MSG, [INP + 3*16] + pshufb 
MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [SHA256CONSTANTS + 3*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3b, MSG + paddd MSG, [SHA256CONSTANTS + 3*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 16-19 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 4*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 4*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 20-23 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 5*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 5*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 24-27 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 6*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 6*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 28-31 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 7*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 7*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, 
MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 32-35 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 8*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 8*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 36-39 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 9*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 9*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 40-43 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 10*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 10*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 44-47 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 11*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 11*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 48-51 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 12*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 12*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + 
pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 52-55 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 13*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 13*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 56-59 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 14*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 14*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 60-63 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 15*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 15*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + paddd STATE0, [rsp + frame.ABEF_SAVE] + paddd STATE1, [rsp + frame.CDGH_SAVE] + paddd STATE0b, [rsp + frame.ABEF_SAVEb] + paddd STATE1b, [rsp + frame.CDGH_SAVEb] + + add INP, 64 + add INPb, 64 + cmp INP, NUM_BLKS + jne .loop0 + + ;; update data pointers + mov [args + _data_ptr_sha256 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha256 + 1*PTR_SZ], INPb + + ; Reorder for writeback + pshufd STATE0, STATE0, 0x1B ; FEBA + pshufd STATE1, STATE1, 0xB1 ; DCHG + movdqa MSGTMP4, STATE0 + pshufd STATE0b, STATE0b, 0x1B ; FEBA + pshufd STATE1b, STATE1b, 0xB1 ; DCHG + movdqa MSGTMP, STATE0b + pblendw STATE0, STATE1, 0xF0 ; DCBA + pblendw STATE0b, STATE1b, 0xF0 ; DCBA + palignr STATE1, MSGTMP4, 8 ; HGFE + palignr STATE1b, MSGTMP, 8 ; HGFE + + ;; update digests + movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0 + movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1 + movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0b + movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1b + + DBGPRINTL "jobA digest out:" + DBGPRINT_XMM STATE0 + DBGPRINT_XMM STATE1 + DBGPRINTL "jobB digest out:" + DBGPRINT_XMM STATE0b + DBGPRINT_XMM STATE1b + +done_hash: + DBGPRINTL "exit sha256-ni-x2" + + ;; Clear stack frame (4*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep 4 + movdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, frame_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm new file mode 100644 index 000000000..8869c14ef --- 
/dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm @@ -0,0 +1,512 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "include/os.asm" + +section .data +default rel +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF +; shuffle xDxC -> DC00 +_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 + +section .text + + +%define MOVDQ movdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 + +%define SHUF_00BA xmm10 ; 
shuffle xBxA -> 00BA +%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 7 +%endif +_XFER: reso 1 +endstruc + +%ifndef FUNC +%define FUNC sha256_block_sse +%endif + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X3 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + palignr XTMP0, X2, 4 ; XTMP0 = W[-7] + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + palignr XTMP1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pslld XTMP1, (32-7) + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + psrld XTMP2, 7 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + mov y0, e ; y0 = e + mov y1, a ; y1 = a + movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] + ror y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y1, (22-13) ; y1 = a >> (22-13) + pslld XTMP3, (32-18) + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + psrld XTMP2, 18 + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP1, XTMP3 + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov 
y2, a ; y2 = a + pxor XTMP1, XTMP4 ; XTMP1 = s0 + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (25-11) ; y0 = e >> (25-11) + movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + pxor XTMP2, XTMP3 + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + pxor X0, XTMP2 ; X0 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; 
y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT32 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,internal) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 +%endif + + ;; load initial digest + mov a, [4*0 + CTX] + mov b, [4*1 + CTX] + mov c, [4*2 + CTX] + mov d, [4*3 + CTX] + mov e, [4*4 + CTX] + mov f, [4*5 + CTX] + mov g, [4*6 + CTX] + mov h, [4*7 + CTX] + + movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + movdqa SHUF_00BA, [rel _SHUF_00BA] + movdqa SHUF_DC00, [rel _SHUF_DC00] + + lea TBL,[rel K256] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + movdqa XFER, [TBL + 0*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + paddd X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne loop2 + + add [4*0 + CTX], a + add [4*1 + CTX], b + add [4*2 + CTX], c + add [4*3 + CTX], d + add [4*4 + CTX], e + add [4*5 + CTX], f + add [4*6 + CTX], g + add [4*7 + CTX], h + +done_hash: +%ifndef LINUX + movdqa xmm6,[rsp + _XMM_SAVE + 0*16] + movdqa xmm7,[rsp + _XMM_SAVE + 1*16] + movdqa xmm8,[rsp + _XMM_SAVE + 2*16] + movdqa xmm9,[rsp + _XMM_SAVE + 3*16] + movdqa xmm10,[rsp + _XMM_SAVE + 4*16] + movdqa xmm11,[rsp + _XMM_SAVE + 5*16] + movdqa 
xmm12,[rsp + _XMM_SAVE + 6*16] +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + pxor xmm0, xmm0 + movdqa [rsp + _XMM_SAVE + 0 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 1 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 2 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 3 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 4 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 5 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 6 * 16], xmm0 +%endif +%endif ;; LINUX + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm new file mode 100644 index 000000000..c95f89d8f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm @@ -0,0 +1,33 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define FUNC sha384_block_sse + +%include "sse/sha512_one_block_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm new file mode 100644 index 000000000..534cfbfd8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm @@ -0,0 +1,480 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "include/os.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned + +%ifndef FUNC +%define FUNC sha512_block_sse +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 +%define X4 xmm8 +%define X5 xmm9 +%define X6 xmm10 +%define X7 xmm11 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XFER xmm13 + +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c rcx +%define d r8 +%define e rdx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c rdi +%define d rsi +%define e r8 + +%endif +%define TBL rbp +%define a rax +%define b rbx + +%define f r9 +%define g r10 +%define h r11 + +%define y0 r13 +%define y1 r14 +%define y2 r15 + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 8 +%endif +_XFER: reso 1 +endstruc + +; rotate_Xs +; Rotate values of symbols X0...X7 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X5 +%xdefine X5 X6 +%xdefine X6 X7 +%xdefine X7 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro TWO_ROUNDS_AND_SCHED 0 + + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X5 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (41-18) ; y0 = e >> (41-18) + palignr XTMP0, X4, 8 ; XTMP0 = W[-7] + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + ror y1, (39-34) ; y1 = a >> (39-34) + xor y1, a ; y1 = a ^ (a >> (39-34) + movdqa XTMP1, X1 + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + paddq XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + ;; compute s0 + palignr XTMP1, X0, 8 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> 
(39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + psllq XTMP1, (64-1) + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + psrlq XTMP2, 1 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + psrlq XTMP2, 8 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + movdqa X0, XTMP3 ; X0 = W[-15] + psllq XTMP3, (64-8) + + +ROTATE_ARGS + pxor XTMP1, XTMP3 + psrlq X0, 7 ; X0 = W[-15] >> 7 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 + ror y0, (41-18) ; y0 = e >> (41-18) + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + pxor XTMP1, X0 ; XTMP1 = s0 + ror y1, (39-34) ; y1 = a >> (39-34) + xor y1, a ; y1 = a ^ (a >> (39-34) + ;; compute s1 + movdqa XTMP2, X7 ; XTMP2 = W[-2] + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + paddq XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] + movdqa X0, XTMP2 ; X0 = W[-2] + and y2, e ; y2 = (f^g)&e + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + psllq XTMP3, (64-19) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH + psrlq X0, 19 + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + por XTMP3, X0 ; XTMP3 = W[-2] ror 19 + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + movdqa X0, XTMP2 ; X0 = W[-2] + movdqa XTMP1, XTMP2 ; XTMP1 = W[-2] + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + psllq X0, (64-61) + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + psrlq XTMP1, 61 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + por X0, XTMP1 ; X0 = W[-2] ror 61 + psrlq XTMP2, 6 ; XTMP2 = W[-2] >> 6 + pxor XTMP2, XTMP3 + pxor X0, XTMP2 ; X0 = s1 + paddq X0, XTMP0 ; X0 = {W[1], W[0]} + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 8] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (41-18) ; y0 = e >> (41-18) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (41-18)) + ror y1, (39-34) ; y1 = a >> (39-34) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (39-34) + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + ROTATE_ARGS +%endm + +section .data 
+default rel +align 64 +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT64 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,internal) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 + movdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a, [8*0 + CTX] + mov b, [8*1 + CTX] + mov c, [8*2 + CTX] + mov d, [8*3 + CTX] + mov e, [8*4 + CTX] + mov f, [8*5 + CTX] + mov g, [8*6 + CTX] + mov h, [8*7 + CTX] + + movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + + lea TBL,[rel K512] + + ;; byte swap first 16 qwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK + + ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds + mov SRND, 4 +align 16 +loop1: + +%assign i 0 +%rep 7 + movdqa XFER, X0 + paddq 
XFER, [TBL + i*16] + movdqa [rsp + _XFER], XFER + TWO_ROUNDS_AND_SCHED +%assign i (i+1) +%endrep + + movdqa XFER, X0 + paddq XFER, [TBL + 7*16] + movdqa [rsp + _XFER], XFER + add TBL, 8*16 + TWO_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 + jmp loop2a +loop2: + movdqa X0, X4 + movdqa X1, X5 + movdqa X2, X6 + movdqa X3, X7 + +loop2a: + paddq X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + + paddq X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + DO_ROUND 0 + DO_ROUND 1 + + paddq X2, [TBL + 2*16] + movdqa [rsp + _XFER], X2 + DO_ROUND 0 + DO_ROUND 1 + + paddq X3, [TBL + 3*16] + movdqa [rsp + _XFER], X3 + add TBL, 4*16 + DO_ROUND 0 + DO_ROUND 1 + + sub SRND, 1 + jne loop2 + + add [8*0 + CTX], a + add [8*1 + CTX], b + add [8*2 + CTX], c + add [8*3 + CTX], d + add [8*4 + CTX], e + add [8*5 + CTX], f + add [8*6 + CTX], g + add [8*7 + CTX], h + +done_hash: +%ifndef LINUX + movdqa xmm6,[rsp + _XMM_SAVE + 0*16] + movdqa xmm7,[rsp + _XMM_SAVE + 1*16] + movdqa xmm8,[rsp + _XMM_SAVE + 2*16] + movdqa xmm9,[rsp + _XMM_SAVE + 3*16] + movdqa xmm10,[rsp + _XMM_SAVE + 4*16] + movdqa xmm11,[rsp + _XMM_SAVE + 5*16] + movdqa xmm12,[rsp + _XMM_SAVE + 6*16] + movdqa xmm13,[rsp + _XMM_SAVE + 7*16] + +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + pxor xmm0, xmm0 + movdqa [rsp + _XMM_SAVE + 0 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 1 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 2 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 3 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 4 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 5 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 6 * 16], xmm0 + movdqa [rsp + _XMM_SAVE + 7 * 16], xmm0 +%endif +%endif ;; LINUX + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm new file mode 100644 index 000000000..77043f29f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm @@ -0,0 +1,449 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute SHA512 by-2 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +section .data +default rel +align 64 +MKGLOBAL(K512_2,data,internal) +K512_2: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 
0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +section .text + +%ifdef LINUX ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPD movupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + movapd %%t0, %%r0 ; t0 = a1 a0 + shufpd %%r0, %%r1, 00b ; r0 = b0 a0 + shufpd %%t0, %%r1, 11b ; t0 = b1 a1 +%endm + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psllq %%tmp, (64-(%%imm)) + psrlq %%reg, %%imm + por %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORQ a0, (18-14) ; sig1: a0 = (e >> 4) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORQ a1, 41 ; sig1: a1 = (e >> 41) + movdqa [SZ2*(%%i&0xf) + rsp],%%T1 + 
paddq %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + paddq h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORQ a2, (34-28) ; sig0: a2 = (a >> 6) + paddq h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORQ a1, 39 ; sig0: a1 = (a >> 39) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddq h, a0 + + paddq d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddq h, a1 ; h = h + ch + W + K + maj + paddq h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ2*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORQ %%T1, 8-1 + movdqa a2, a1 + PRORQ a1, 61-19 + pxor %%T1, a0 + PRORQ %%T1, 1 + pxor a1, a2 + PRORQ a1, 19 + psrlq a0, 7 + pxor %%T1, a0 + psrlq a2, 6 + pxor a1, a2 + paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp] + paddq a1, [SZ2*((%%i-7)&0xf) + rsp] + paddq %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + + + +;; SHA512_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[2]; +;; + +;; void sha512_x2_sse(SHA512_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha512_x2_sse,function,internal) +align 32 +sha512_x2_sse: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
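+	;; Note (assumed from the "transposed digests" layout of SHA512_ARGS):
+	;; digest row i in STATE holds 64-bit word i of the digest for both
+	;; lanes, so each 128-bit load below fetches that word for lane 0 and
+	;; lane 1 together.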
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + DBGPRINTL_XMM "incoming transposed sha512 digest", a, b, c, d, e, f, g, h + lea TBL,[rel K512_2] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + DBGPRINTL64 "lloop enter INP_SIZE ", INP_SIZE + DBGPRINTL64 " IDX = ", IDX + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ2], a + movdqa [rsp + _DIGEST + 1*SZ2], b + movdqa [rsp + _DIGEST + 2*SZ2], c + movdqa [rsp + _DIGEST + 3*SZ2], d + movdqa [rsp + _DIGEST + 4*SZ2], e + movdqa [rsp + _DIGEST + 5*SZ2], f + movdqa [rsp + _DIGEST + 6*SZ2], g + movdqa [rsp + _DIGEST + 7*SZ2], h + + DBGPRINTL "incoming data[" +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + MOVPD TT2,[inp1+IDX+i*16] + DBGPRINTL_XMM "input message block", TT0 + TRANSPOSE TT0, TT2, TT1 + pshufb TT0, TMP + pshufb TT1, TMP + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + DBGPRINTL "]" + add IDX, 8 * 16 ;; increment by a message block + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddq a, [rsp + _DIGEST + 0*SZ2] + paddq b, [rsp + _DIGEST + 1*SZ2] + paddq c, [rsp + _DIGEST + 2*SZ2] + paddq d, [rsp + _DIGEST + 3*SZ2] + paddq e, [rsp + _DIGEST + 4*SZ2] + paddq f, [rsp + _DIGEST + 5*SZ2] + paddq g, [rsp + _DIGEST + 6*SZ2] + paddq h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a + movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b + movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c + movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d + movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e + movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f + movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g + movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h + DBGPRINTL_XMM "exit transposed digest ", a, b, c, d, e, f, g, h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + ;; Clear stack frame ((16 + 8)*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep (16+NUM_SHA512_DIGEST_WORDS) + movdqa [rsp + i*SZ2], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, STACK_size +DBGPRINTL "====================== exit sha512_x2_sse code =====================\n" + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm new file mode 100644 index 000000000..954d6597e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm @@ -0,0 +1,457 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and 
binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute quad SHA256 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r12 r14 r15 +;; +;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +section .data +default rel +align 64 +MKGLOBAL(K256_4,data,internal) +K256_4: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 
0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%ifdef LINUX ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r12 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ4 + +; Define stack usage +struc STACK +_DATA: resb SZ4 * 16 +_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPS movups + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine 
h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + rsp],%%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp] + paddd a1, [SZ4*((%%i-7)&0xf) + rsp] + paddd %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + + + +;; SHA256_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[4]; +;; + +;; void sha_256_mult_sse(SHA256_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha_256_mult_sse,function,internal) +align 32 +sha_256_mult_sse: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
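+	;; Note (assumed from the "transposed digests" layout of SHA256_ARGS):
+	;; digest row i in STATE holds 32-bit word i of the digest for all four
+	;; lanes, so each 128-bit load below fetches that word for lanes 0-3
+	;; together.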
+ movdqa a,[STATE + 0 * SHA256_DIGEST_ROW_SIZE ] + movdqa b,[STATE + 1 * SHA256_DIGEST_ROW_SIZE ] + movdqa c,[STATE + 2 * SHA256_DIGEST_ROW_SIZE ] + movdqa d,[STATE + 3 * SHA256_DIGEST_ROW_SIZE ] + movdqa e,[STATE + 4 * SHA256_DIGEST_ROW_SIZE ] + movdqa f,[STATE + 5 * SHA256_DIGEST_ROW_SIZE ] + movdqa g,[STATE + 6 * SHA256_DIGEST_ROW_SIZE ] + movdqa h,[STATE + 7 * SHA256_DIGEST_ROW_SIZE ] + + DBGPRINTL_XMM "incoming transposed sha256 digest", a, b, c, d, e, f, g, h + lea TBL,[rel K256_4] + + ;; load the address of each of the 4 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] + DBGPRINTL64 "incoming input data ptrs ", inp0, inp1, inp2, inp3 + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ4], a + movdqa [rsp + _DIGEST + 1*SZ4], b + movdqa [rsp + _DIGEST + 2*SZ4], c + movdqa [rsp + _DIGEST + 3*SZ4], d + movdqa [rsp + _DIGEST + 4*SZ4], e + movdqa [rsp + _DIGEST + 5*SZ4], f + movdqa [rsp + _DIGEST + 6*SZ4], g + movdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + MOVPS TT2,[inp0+IDX+i*16] + MOVPS TT1,[inp1+IDX+i*16] + MOVPS TT4,[inp2+IDX+i*16] + MOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + pshufb TT0, TMP + pshufb TT1, TMP + pshufb TT2, TMP + pshufb TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddd a, [rsp + _DIGEST + 0*SZ4] + paddd b, [rsp + _DIGEST + 1*SZ4] + paddd c, [rsp + _DIGEST + 2*SZ4] + paddd d, [rsp + _DIGEST + 3*SZ4] + paddd e, [rsp + _DIGEST + 4*SZ4] + paddd f, [rsp + _DIGEST + 5*SZ4] + paddd g, [rsp + _DIGEST + 6*SZ4] + paddd h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE+0*SHA256_DIGEST_ROW_SIZE ],a + movdqa [STATE+1*SHA256_DIGEST_ROW_SIZE ],b + movdqa [STATE+2*SHA256_DIGEST_ROW_SIZE ],c + movdqa [STATE+3*SHA256_DIGEST_ROW_SIZE ],d + movdqa [STATE+4*SHA256_DIGEST_ROW_SIZE ],e + movdqa [STATE+5*SHA256_DIGEST_ROW_SIZE ],f + movdqa [STATE+6*SHA256_DIGEST_ROW_SIZE ],g + movdqa [STATE+7*SHA256_DIGEST_ROW_SIZE ],h + DBGPRINTL_XMM "updated transposed sha256 digest", a, b, c, d, e, f, g, h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha256 + 0*8], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha256 + 1*8], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha256 + 2*8], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha256 + 3*8], inp3 + + DBGPRINTL64 "updated input data ptrs ", inp0, inp1, inp2, inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + +%ifdef SAFE_DATA + ;; Clear stack frame ((16 + 8)*16 bytes) + pxor xmm0, xmm0 +%assign i 0 +%rep (16+NUM_SHA256_DIGEST_WORDS) + movdqa [rsp + i*SZ4], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, STACK_size + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c b/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c new file mode 100644 index 000000000..aadd85633 --- 
/dev/null +++ b/src/spdk/intel-ipsec-mb/sse/snow3g_sse.c @@ -0,0 +1,42 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define SSE +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_sse +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_sse +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_sse +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_sse +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_sse +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_sse +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_sse +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_sse +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_sse +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_sse +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_sse +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse + +#include "include/snow3g_common.h" diff --git a/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm b/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm new file mode 100755 index 000000000..0f4e490f9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/zuc_sse.asm @@ -0,0 +1,1152 @@ +;; +;; Copyright (c) 2009-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +extern lookup_8bit_sse + +section .data +default rel +align 64 +S0: +db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb +db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90 +db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac +db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38 +db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b +db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c +db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad +db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8 +db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56 +db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe +db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d +db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23 +db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1 +db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f +db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65 +db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60 + +S1: +db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77 +db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42 +db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1 +db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48 +db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87 +db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb +db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09 +db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9 +db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9 +db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89 +db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4 +db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde +db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21 +db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34 +db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28 +db 
0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2 + +EK_d: +dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF, +dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC + +mask31: +dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF + +align 16 +bit_reverse_table_l: +db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f + +align 16 +bit_reverse_table_h: +db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 + +align 16 +bit_reverse_and_table: +db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f + +align 16 +data_mask_64bits: +dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + +bit_mask_table: +db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe + + +section .text + +%define MASK31 xmm12 + +%define OFS_R1 (16*(4*4)) +%define OFS_R2 (OFS_R1 + (4*4)) +%define OFS_X0 (OFS_R2 + (4*4)) +%define OFS_X1 (OFS_X0 + (4*4)) +%define OFS_X2 (OFS_X1 + (4*4)) +%define OFS_X3 (OFS_X2 + (4*4)) + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET XMM_STORAGE + +%macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 +%ifidn __OUTPUT_FORMAT__, win64 + push rdi + push rsi +%endif + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm7 + movdqu [rsp + 2*16],xmm8 + movdqu [rsp + 3*16],xmm9 + movdqu [rsp + 4*16],xmm10 + movdqu [rsp + 5*16],xmm11 + movdqu [rsp + 6*16],xmm12 + movdqu [rsp + 7*16],xmm13 + movdqu [rsp + 8*16],xmm14 + movdqu [rsp + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15, [rsp + 9*16] + movdqu xmm14, [rsp + 8*16] + movdqu xmm13, [rsp + 7*16] + movdqu xmm12, [rsp + 6*16] + movdqu xmm11, [rsp + 5*16] + movdqu xmm10, [rsp + 4*16] + movdqu xmm9, [rsp + 3*16] + movdqu xmm8, [rsp + 2*16] + movdqu xmm7, [rsp + 1*16] + movdqu xmm6, [rsp + 0*16] +%endif + mov rsp, r14 +%ifidn __OUTPUT_FORMAT__, win64 + pop rsi + pop rdi +%endif + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +; +; make_u31() +; +%macro make_u31 4 + +%define %%Rt %1 +%define %%Ke %2 +%define %%Ek %3 +%define %%Iv %4 + xor %%Rt, %%Rt + shrd %%Rt, %%Iv, 8 + shrd %%Rt, %%Ek, 15 + shrd %%Rt, %%Ke, 9 +%endmacro + + +; +; bits_reorg4() +; +; params +; %1 - round number +; rax - LFSR pointer +; uses +; +; return +; +%macro bits_reorg4 1 + ; + ; xmm15 = LFSR_S15 + ; xmm14 = LFSR_S14 + ; xmm11 = LFSR_S11 + ; xmm9 = LFSR_S9 + ; xmm7 = LFSR_S7 + ; xmm5 = LFSR_S5 + ; xmm2 = LFSR_S2 + ; xmm0 = LFSR_S0 + ; + movdqa xmm15, [rax + ((15 + %1) % 16)*16] + movdqa xmm14, [rax + ((14 + %1) % 16)*16] + movdqa xmm11, [rax + ((11 + %1) % 16)*16] + movdqa xmm9, [rax + (( 9 + %1) % 16)*16] + movdqa xmm7, [rax + (( 7 + %1) % 16)*16] + movdqa xmm5, [rax + (( 5 + %1) % 16)*16] + movdqa xmm2, [rax + (( 2 + %1) % 16)*16] + movdqa xmm0, [rax + (( 0 + %1) % 16)*16] + + pxor xmm1, xmm1 + pslld xmm15, 1 + movdqa xmm3, xmm14 + pblendw xmm3, xmm1, 0xAA + pblendw xmm15, xmm3, 0x55 + + movdqa [rax + OFS_X0], xmm15 ; BRC_X0 + pslld xmm11, 16 + psrld xmm9, 15 + por xmm11, xmm9 + movdqa [rax + OFS_X1], xmm11 ; BRC_X1 + pslld xmm7, 16 + psrld xmm5, 15 + por xmm7, xmm5 + movdqa [rax + OFS_X2], xmm7 ; BRC_X2 + pslld xmm2, 16 + psrld xmm0, 15 + por xmm2, xmm0 + movdqa [rax + OFS_X3], xmm2 ; BRC_X3 +%endmacro + 
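+;; Per-lane scalar equivalent of the reorganization above (a sketch,
+;; assuming 31-bit LFSR words s0..s15 as in the ZUC specification):
+;;   BRC_X0 = ((s15 << 1) & 0xFFFF0000) | (s14 & 0xFFFF)
+;;   BRC_X1 = ((s11 & 0xFFFF) << 16) | (s9 >> 15)
+;;   BRC_X2 = ((s7  & 0xFFFF) << 16) | (s5 >> 15)
+;;   BRC_X3 = ((s2  & 0xFFFF) << 16) | (s0 >> 15)
+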
+%macro lookup_single_sbox 2 +%define %%table %1 ; [in] Pointer to table to look up +%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9) + +%ifdef SAFE_LOOKUP + ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10) + ;; and registers for param passing and return (4 regs, OS dependent) + ;; (6*16 + 6*8 = 144 bytes) + sub rsp, 144 + + movdqu [rsp], xmm0 + movdqu [rsp + 16], xmm1 + movdqu [rsp + 32], xmm2 + movdqu [rsp + 48], xmm3 + movdqu [rsp + 64], xmm4 + movdqu [rsp + 80], xmm5 + mov [rsp + 96], r9 + mov [rsp + 104], r10 + +%ifdef LINUX + mov [rsp + 112], rdi + mov [rsp + 120], rsi + mov [rsp + 128], rdx + mov rdi, %%table + mov rsi, %%idx_val + mov rdx, 256 +%else +%ifnidni %%idx_val, rcx + mov [rsp + 112], rcx +%endif +%ifnidni %%idx_val, rdx + mov [rsp + 120], rdx +%endif +%ifnidni %%idx_val, r8 + mov [rsp + 128], r8 +%endif + + mov rdx, %%idx_val + mov rcx, %%table + mov r8, 256 +%endif + mov [rsp + 136], rax + + call lookup_8bit_sse + + ;; Restore all registers + movdqu xmm0, [rsp] + movdqu xmm1, [rsp + 16] + movdqu xmm2, [rsp + 32] + movdqu xmm3, [rsp + 48] + movdqu xmm4, [rsp + 64] + movdqu xmm5, [rsp + 80] + mov r9, [rsp + 96] + mov r10, [rsp + 104] + +%ifdef LINUX + mov rdi, [rsp + 112] + mov rsi, [rsp + 120] + mov rdx, [rsp + 128] +%else +%ifnidni %%idx_val, rcx + mov rcx, [rsp + 112] +%endif +%ifnidni %%idx_val, rdx + mov rdx, [rsp + 120] +%endif +%ifnidni %%idx_val, rcx + mov r8, [rsp + 128] +%endif +%endif + + ;; Move returned value from lookup function, before restoring rax + mov DWORD(%%idx_val), eax + mov rax, [rsp + 136] + + add rsp, 144 + +%else ;; SAFE_LOOKUP + + movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val] + +%endif ;; SAFE_LOOKUP +%endmacro + +; +; sbox_lkup() +; +; params +; %1 R1/R2 table offset +; %2 R1/R2 entry offset +; %3 xmm reg name +; uses +; rcx,rdx,r8,r9,r10,rsi +; return +; +%macro sbox_lkup 3 + pextrb rcx, %3, (0 + (%2 * 4)) + lookup_single_sbox rsi, rcx + + pextrb rdx, %3, (1 + (%2 * 4)) + lookup_single_sbox rdi, rdx + + xor r10, r10 + pextrb r8, %3, (2 + (%2 * 4)) + lookup_single_sbox rsi, r8 + pextrb r9, %3, (3 + (%2 * 4)) + lookup_single_sbox rdi, r9 + + shrd r10d, ecx, 8 + shrd r10d, edx, 8 + shrd r10d, r8d, 8 + shrd r10d, r9d, 8 + mov [rax + %1 + (%2 * 4)], r10d +%endmacro + + +; +; rot_mod32() +; +; uses xmm7 +; +%macro rot_mod32 3 + movdqa %1, %2 + pslld %1, %3 + movdqa xmm7, %2 + psrld xmm7, (32 - %3) + + por %1, xmm7 +%endmacro + + +; +; nonlin_fun4() +; +; params +; %1 == 1, then calculate W +; uses +; +; return +; xmm0 = W value, updates F_R1[] / F_R2[] +; +%macro nonlin_fun4 1 + +%if (%1 == 1) + movdqa xmm0, [rax + OFS_X0] + pxor xmm0, [rax + OFS_R1] + paddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2 +%endif + ; + movdqa xmm1, [rax + OFS_R1] + movdqa xmm2, [rax + OFS_R2] + paddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1 + pxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2 + ; + + movdqa xmm3, xmm1 + movdqa xmm4, xmm1 + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + pslld xmm3, 16 + psrld xmm4, 16 + pslld xmm5, 16 + psrld xmm6, 16 + movdqa xmm1, xmm3 + movdqa xmm2, xmm4 + por xmm1, xmm6 + por xmm2, xmm5 + + ; + rot_mod32 xmm3, xmm1, 2 + rot_mod32 xmm4, xmm1, 10 + rot_mod32 xmm5, xmm1, 18 + rot_mod32 xmm6, xmm1, 24 + pxor xmm1, xmm3 + pxor xmm1, xmm4 + pxor xmm1, xmm5 + pxor xmm1, xmm6 ; XMM1 = U = L1(P) + + sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0] + sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1] + sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2] + sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3] + ; + rot_mod32 xmm3, xmm2, 8 + rot_mod32 xmm4, 
xmm2, 14 + rot_mod32 xmm5, xmm2, 22 + rot_mod32 xmm6, xmm2, 30 + pxor xmm2, xmm3 + pxor xmm2, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm6 ; XMM2 = V = L2(Q) + ; + + sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0] + sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1] + sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2] + sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3] +%endmacro + + +; +; store_kstr4() +; +; params +; +; uses +; xmm0 as input +; return +; +%macro store_kstr4 0 + pxor xmm0, [rax + OFS_X3] + pextrd r15d, xmm0, 3 + pop r9 ; *pKeyStr4 + pextrd r14d, xmm0, 2 + pop r8 ; *pKeyStr3 + pextrd r13d, xmm0, 1 + pop rdx ; *pKeyStr2 + pextrd r12d, xmm0, 0 + pop rcx ; *pKeyStr1 + mov [r9], r15d + mov [r8], r14d + mov [rdx], r13d + mov [rcx], r12d + add rcx, 4 + add rdx, 4 + add r8, 4 + add r9, 4 + push rcx + push rdx + push r8 + push r9 +%endmacro + + +; +; add_mod31() +; add two 32-bit args and reduce mod (2^31-1) +; params +; %1 - arg1/res +; %2 - arg2 +; uses +; xmm2 +; return +; %1 +%macro add_mod31 2 + paddd %1, %2 + movdqa xmm2, %1 + psrld xmm2, 31 + pand %1, MASK31 + paddd %1, xmm2 +%endmacro + + +; +; rot_mod31() +; rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1) +; params +; %1 - arg +; %2 - # of bits +; uses +; xmm2 +; return +; %1 +%macro rot_mod31 2 + + movdqa xmm2, %1 + pslld xmm2, %2 + psrld %1, (31 - %2) + + por %1, xmm2 + pand %1, MASK31 +%endmacro + + +; +; lfsr_updt4() +; +; params +; %1 - round number +; uses +; xmm0 as input (ZERO or W) +; return +; +%macro lfsr_updt4 1 + ; + ; xmm1 = LFSR_S0 + ; xmm4 = LFSR_S4 + ; xmm10 = LFSR_S10 + ; xmm13 = LFSR_S13 + ; xmm15 = LFSR_S15 + ; + pxor xmm3, xmm3 + movdqa xmm1, [rax + (( 0 + %1) % 16)*16] + movdqa xmm4, [rax + (( 4 + %1) % 16)*16] + movdqa xmm10, [rax + ((10 + %1) % 16)*16] + movdqa xmm13, [rax + ((13 + %1) % 16)*16] + movdqa xmm15, [rax + ((15 + %1) % 16)*16] + + ; Calculate LFSR feedback + add_mod31 xmm0, xmm1 + rot_mod31 xmm1, 8 + add_mod31 xmm0, xmm1 + rot_mod31 xmm4, 20 + add_mod31 xmm0, xmm4 + rot_mod31 xmm10, 21 + add_mod31 xmm0, xmm10 + rot_mod31 xmm13, 17 + add_mod31 xmm0, xmm13 + rot_mod31 xmm15, 15 + add_mod31 xmm0, xmm15 + + + + movdqa [rax + (( 0 + %1) % 16)*16], xmm0 + + ; LFSR_S16 = (LFSR_S15++) = eax +%endmacro + + +; +; key_expand_4() +; +%macro key_expand_4 2 + movzx r8d, byte [rdi + (%1 + 0)] + movzx r9d, word [rbx + ((%1 + 0)*2)] + movzx r10d, byte [rsi + (%1 + 0)] + make_u31 r11d, r8d, r9d, r10d + mov [rax + (((%1 + 0)*16)+(%2*4))], r11d + + movzx r12d, byte [rdi + (%1 + 1)] + movzx r13d, word [rbx + ((%1 + 1)*2)] + movzx r14d, byte [rsi + (%1 + 1)] + make_u31 r15d, r12d, r13d, r14d + mov [rax + (((%1 + 1)*16)+(%2*4))], r15d +%endmacro + +MKGLOBAL(asm_ZucInitialization_4_sse,function,internal) +asm_ZucInitialization_4_sse: + +%ifdef LINUX + %define pKe rdi + %define pIv rsi + %define pState rdx +%else + %define pKe rcx + %define pIv rdx + %define pState r8 +%endif + + ; Save non-volatile registers + push rbx + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + push rdx + + lea rax, [pState] ; load pointer to LFSR + push pState ; Save LFSR Pointer to stack + + ; setup the key pointer for first buffer key expand + mov rbx, [pKe] ; load the pointer to the array of keys into rbx + + push pKe ; save rdi (key pointer) to the stack + lea rdi, [rbx] ; load the pointer to the first key into rdi + + + ; setup the IV pointer for first buffer key expand + mov rcx, [pIv] ; load the pointer to the array of IV's + push pIv ; save the IV pointer to the stack + lea rsi, [rcx] ; load the first IV pointer + + lea rbx, [EK_d] ; load D variables + + ; 
Expand key packet 1 + key_expand_4 0, 0 + key_expand_4 2, 0 + key_expand_4 4, 0 + key_expand_4 6, 0 + key_expand_4 8, 0 + key_expand_4 10, 0 + key_expand_4 12, 0 + key_expand_4 14, 0 + + + ;second packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+8] ; load offset to IV 2 in array + lea rsi, [rcx] ; load pointer to IV2 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+8] ; load offset to key 2 in array + lea rdi, [rcx] ; load pointer to Key 2 + + push rbx ; save Key pointer + push rdx ; save IV pointer + + lea rbx, [EK_d] + + ; Expand key packet 2 + key_expand_4 0, 1 + key_expand_4 2, 1 + key_expand_4 4, 1 + key_expand_4 6, 1 + key_expand_4 8, 1 + key_expand_4 10, 1 + key_expand_4 12, 1 + key_expand_4 14, 1 + + + + ;Third packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+16] ; load offset to IV 3 in array + lea rsi, [rcx] ; load pointer to IV3 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+16] ; load offset to key 3 in array + lea rdi, [rcx] ; load pointer to Key 3 + + push rbx ; save Key pointer + push rdx ; save IV pointer + lea rbx, [EK_d] + ; Expand key packet 3 + key_expand_4 0, 2 + key_expand_4 2, 2 + key_expand_4 4, 2 + key_expand_4 6, 2 + key_expand_4 8, 2 + key_expand_4 10, 2 + key_expand_4 12, 2 + key_expand_4 14, 2 + + + + ;fourth packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+24] ; load offset to IV 4 in array + lea rsi, [rcx] ; load pointer to IV4 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+24] ; load offset to key 2 in array + lea rdi, [rcx] ; load pointer to Key 2 + lea rbx, [EK_d] + ; Expand key packet 4 + key_expand_4 0, 3 + key_expand_4 2, 3 + key_expand_4 4, 3 + key_expand_4 6, 3 + key_expand_4 8, 3 + key_expand_4 10, 3 + key_expand_4 12, 3 + key_expand_4 14, 3 + + ; Set R1 and R2 to zero + ;xor r10, r10 + ;xor r11, r11 + + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + movdqa xmm12, [mask31] + + ; Shift LFSR 32-times, update state variables +%assign N 0 +%rep 32 + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 N + nonlin_fun4 1 + psrld xmm0,1 ; Shift out LSB of W + + pop rdx + lea rax, [rdx] + push rdx + + lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero +%assign N N+1 +%endrep + + ; And once more, initial round from keygen phase = 33 times + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 0 + nonlin_fun4 0 + + pop rdx + lea rax, [rdx] + + pxor xmm0, xmm0 + lfsr_updt4 0 + + + + ; Restore non-volatile registers + pop rdx + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbx + + ret +; +; +; +;; +;; void asm_ZucGenKeystream64B_4_sse(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4); +;; +;; WIN64 +;; RCX - pSta +;; RDX - pKeyStr1 +;; R8 - pKeyStr2 +;; R9 - pKeyStr3 +;; Stack - pKeyStr4 +;; +;; LIN64 +;; RDI - pSta +;; RSI - pKeyStr1 +;; RDX - pKeyStr2 +;; RCX - pKeyStr3 +;; R8 - pKeyStr4 +;; +MKGLOBAL(asm_ZucGenKeystream64B_4_sse,function,internal) +asm_ZucGenKeystream64B_4_sse: + +%ifdef LINUX + %define pState rdi + %define pKS1 rsi + %define pKS2 rdx + %define pKS3 rcx + %define pKS4 r8 +%else + %define pState rcx + %define pKS1 rdx + %define pKS2 r8 + %define pKS3 r9 + %define pKS4 rax +%endif + +%ifndef LINUX + mov rax, [rsp + 8*5] ; 5th parameter from stack +%endif + + ; Save non-volatile registers + push rbx + push r12 + push r13 + push r14 + push r15 + +%ifndef 
LINUX + push rdi + push rsi +%endif + ; Store 4 keystream pointers on the stack + + push pKS1 + push pKS2 + push pKS3 + push pKS4 + + + ; Load state pointer in RAX + mov rax, pState + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + movdqa xmm12, [mask31] + + ; Generate 64B of keystream in 16 rounds +%assign N 1 +%rep 16 + bits_reorg4 N + nonlin_fun4 1 + store_kstr4 + pxor xmm0, xmm0 + lfsr_updt4 N +%assign N N+1 +%endrep + + ; Take keystream pointers off (#push = #pops) + pop rax + pop rax + pop rax + pop rax + +%ifndef LINUX + pop rsi + pop rdi +%endif + + ; Restore non-volatile registers + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + +;; +;; extern uint32_t Zuc_Eia3_Remainder_sse(const void *ks, const void *data, uint64_t n_bits) +;; +;; Returns authentication update value to be XOR'ed with current authentication tag +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - DATA (data pointer) +;; R8 - N_BITS (number data bits to process) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - DATA (data pointer) +;; RDX - N_BITS (number data bits to process) +;; +align 16 +MKGLOBAL(asm_Eia3RemainderSSE,function,internal) +asm_Eia3RemainderSSE: +%ifdef LINUX + %define KS rdi + %define DATA rsi + %define N_BITS rdx +%else + %define KS rcx + %define DATA rdx + %define N_BITS r8 +%endif + + FUNC_SAVE + + movdqa xmm5, [bit_reverse_table_l] + movdqa xmm6, [bit_reverse_table_h] + movdqa xmm7, [bit_reverse_and_table] + movdqa xmm10, [data_mask_64bits] + + pxor xmm9, xmm9 + +%rep 3 + cmp N_BITS, 128 + jb Eia3RoundsSSE_dq_end + + ;; read 16 bytes and reverse bits + movdqu xmm0, [DATA] + movdqa xmm1, xmm0 + pand xmm1, xmm7 + + movdqa xmm2, xmm7 + pandn xmm2, xmm0 + psrld xmm2, 4 + + movdqa xmm8, xmm6 ; bit reverse low nibbles (use high table) + pshufb xmm8, xmm1 + + movdqa xmm4, xmm5 ; bit reverse high nibbles (use low table) + pshufb xmm4, xmm2 + + por xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC authentication part + ;; - 4x32 data bits + ;; - set up KS + movdqu xmm3, [KS + (0*4)] + movdqu xmm4, [KS + (2*4)] + pshufd xmm0, xmm3, 0x61 + pshufd xmm1, xmm4, 0x61 + + ;; - set up DATA + movdqa xmm2, xmm8 + pand xmm2, xmm10 + pshufd xmm3, xmm2, 0xdc + movdqa xmm4, xmm3 + + psrldq xmm8, 8 + pshufd xmm13, xmm8, 0xdc + movdqa xmm14, xmm13 + + ;; - clmul + ;; - xor the results from 4 32-bit words together + pclmulqdq xmm3, xmm0, 0x00 + pclmulqdq xmm4, xmm0, 0x11 + pclmulqdq xmm13, xmm1, 0x00 + pclmulqdq xmm14, xmm1, 0x11 + + pxor xmm3, xmm4 + pxor xmm13, xmm14 + pxor xmm9, xmm3 + pxor xmm9, xmm13 + lea DATA, [DATA + 16] + lea KS, [KS + 16] + sub N_BITS, 128 +%endrep +Eia3RoundsSSE_dq_end: + +%rep 3 + cmp N_BITS, 32 + jb Eia3RoundsSSE_dw_end + + ;; swap dwords in KS + movq xmm1, [KS] + pshufd xmm4, xmm1, 0xf1 + + ;; bit-reverse 4 bytes of data + movdqa xmm2, xmm7 + movd xmm0, [DATA] + movdqa xmm1, xmm0 + pand xmm1, xmm2 + + pandn xmm2, xmm0 + psrld xmm2, 4 + + movdqa xmm0, xmm6 ; bit reverse low nibbles (use high table) + pshufb xmm0, xmm1 + + movdqa xmm3, xmm5 ; bit reverse high nibbles (use low table) + pshufb xmm3, xmm2 + + por xmm0, xmm3 + + ;; rol & xor + pclmulqdq xmm0, xmm4, 0 + pxor xmm9, xmm0 + + lea DATA, [DATA + 4] + lea KS, [KS + 4] + sub N_BITS, 32 +%endrep + +Eia3RoundsSSE_dw_end: + movq rax, xmm9 + shr rax, 32 + + or N_BITS, N_BITS + jz Eia3RoundsSSE_byte_loop_end + + ;; get 64-bit key stream for the last data bits (less than 32) + mov KS, [KS] + +; ;; process remaining data bytes and bits +Eia3RoundsSSE_byte_loop: + or 
N_BITS, N_BITS + jz Eia3RoundsSSE_byte_loop_end + + cmp N_BITS, 8 + jb Eia3RoundsSSE_byte_partial + + movzx r11, byte [DATA] + sub N_BITS, 8 + jmp Eia3RoundsSSE_byte_read + +Eia3RoundsSSE_byte_partial: + ;; process remaining bits (up to 7) + lea r11, [bit_mask_table] + movzx r10, byte [r11 + N_BITS] + movzx r11, byte [DATA] + and r11, r10 + xor N_BITS, N_BITS +Eia3RoundsSSE_byte_read: + +%assign DATATEST 0x80 +%rep 8 + xor r10, r10 + test r11, DATATEST + cmovne r10, KS + xor rax, r10 + rol KS, 1 +%assign DATATEST (DATATEST >> 1) +%endrep ; byte boundary + lea DATA, [DATA + 1] + jmp Eia3RoundsSSE_byte_loop + +Eia3RoundsSSE_byte_loop_end: + + ;; eax - holds the return value at this stage + + FUNC_RESTORE + + ret + +;; +;;extern uint32_t Zuc_Eia3_Round64B_sse(uint32_t T, const void *KS, const void *DATA) +;; +;; Updates authentication tag T based on keystream KS and DATA. +;; - it processes 64 bytes of DATA +;; - reads data in 16 byte chunks and bit reverses them +;; - reads and re-arranges KS +;; - employs clmul for the XOR & ROL part +;; - copies top 64 butes of KS to bottom (for the next round) +;; +;; WIN64 +;; RCX - T +;; RDX - KS pointer to key stream (2 x 64 bytes) +;;; R8 - DATA pointer to data +;; LIN64 +;; RDI - T +;; RSI - KS pointer to key stream (2 x 64 bytes) +;; RDX - DATA pointer to data +;; +align 16 +MKGLOBAL(asm_Eia3Round64BSSE,function,internal) +asm_Eia3Round64BSSE: + +%ifdef LINUX + %define T edi + %define KS rsi + %define DATA rdx +%else + %define T ecx + %define KS rdx + %define DATA r8 +%endif + + FUNC_SAVE + + movdqa xmm5, [bit_reverse_table_l] + movdqa xmm6, [bit_reverse_table_h] + movdqa xmm7, [bit_reverse_and_table] + movdqa xmm10, [data_mask_64bits] + + pxor xmm9, xmm9 + +%assign I 0 +%rep 4 + ;; read 16 bytes and reverse bits + movdqu xmm0, [DATA + 16*I] + movdqa xmm1, xmm0 + pand xmm1, xmm7 + + movdqa xmm2, xmm7 + pandn xmm2, xmm0 + psrld xmm2, 4 + + movdqa xmm8, xmm6 ; bit reverse low nibbles (use high table) + pshufb xmm8, xmm1 + + movdqa xmm4, xmm5 ; bit reverse high nibbles (use low table) + pshufb xmm4, xmm2 + + por xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC authentication part + ;; - 4x32 data bits + ;; - set up KS +%if I != 0 + movdqa xmm0, xmm12 + movdqu xmm2, [KS + (I*16) + (4*4)] + movdqa xmm12, xmm2 + palignr xmm2, xmm0, 8 + pshufd xmm1, xmm0, 0x61 + pshufd xmm11, xmm2, 0x61 +%else + movdqu xmm2, [KS + (I*16) + (0*4)] + movdqu xmm3, [KS + (I*16) + (4*4)] + movdqa xmm12, xmm3 + palignr xmm3, xmm2, 8 + pshufd xmm1, xmm2, 0x61 + pshufd xmm11, xmm3, 0x61 +%endif + + ;; - set up DATA + movdqa xmm0, xmm8 + pand xmm0, xmm10 + pshufd xmm3, xmm0, 0xdc + movdqa xmm0, xmm3 + + psrldq xmm8, 8 + pshufd xmm13, xmm8, 0xdc + movdqa xmm14, xmm13 + + ;; - clmul + ;; - xor the results from 4 32-bit words together + pclmulqdq xmm0, xmm1, 0x00 + pclmulqdq xmm3, xmm1, 0x11 + pclmulqdq xmm14, xmm11, 0x00 + pclmulqdq xmm13, xmm11, 0x11 + + pxor xmm3, xmm0 + pxor xmm13, xmm14 + pxor xmm9, xmm3 + pxor xmm9, xmm13 + +%assign I (I + 1) +%endrep + + ;; - update T + movq rax, xmm9 + shr rax, 32 + xor eax, T + + FUNC_RESTORE + + ret + + +;---------------------------------------------------------------------------------------- +;---------------------------------------------------------------------------------------- + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c b/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c new file mode 100755 index 000000000..5a4eb98c5 --- /dev/null +++ 
b/src/spdk/intel-ipsec-mb/sse/zuc_sse_top.c @@ -0,0 +1,554 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/*----------------------------------------------------------------------- +* zuc_sse.c +*----------------------------------------------------------------------- +* An implementation of ZUC, the core algorithm for the +* 3GPP Confidentiality and Integrity algorithms. 
+*
+*-----------------------------------------------------------------------*/
+
+#include <string.h>
+
+#include "include/zuc_internal.h"
+#include "include/wireless_common.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+#include "intel-ipsec-mb.h"
+
+#define SAVE_XMMS save_xmms
+#define RESTORE_XMMS restore_xmms
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_sse
+
+static inline
+void _zuc_eea3_1_buffer_sse(const void *pKey,
+                            const void *pIv,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t length)
+{
+        DECLARE_ALIGNED(ZucState_t zucState, 64);
+        DECLARE_ALIGNED(uint8_t keyStream[64], 64);
+        /* buffer to store 64 bytes of keystream */
+        DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+        DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+
+        const uint64_t *pIn64 = NULL;
+        const uint8_t *pIn8 = NULL;
+        uint8_t *pOut8 = NULL;
+        uint64_t *pOut64 = NULL, *pKeyStream64 = NULL;
+        uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL;
+
+        uint32_t numKeyStreamsPerPkt = length / ZUC_KEYSTR_LEN;
+        uint32_t numBytesLeftOver = length % ZUC_KEYSTR_LEN;
+
+        /* need to set the LFSR state to zero */
+        memset(&zucState, 0, sizeof(ZucState_t));
+
+        /* initialize the zuc state */
+        asm_ZucInitialization(pKey, pIv, &(zucState));
+
+        /* Loop Over all the Quad-Words in input buffer and XOR with the 64bits
+         * of generated keystream */
+        pOut64 = (uint64_t *) pBufferOut;
+        pIn64 = (const uint64_t *) pBufferIn;
+
+        while (numKeyStreamsPerPkt--) {
+                /* Generate the key stream 64 bytes at a time */
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+                /* XOR The Keystream generated with the input buffer here */
+                pKeyStream64 = (uint64_t *) keyStream;
+                asm_XorKeyStream64B_sse(pIn64, pOut64, pKeyStream64);
+                pIn64 += 8;
+                pOut64 += 8;
+        }
+
+        /* Check for remaining 0 to 63 bytes */
+        pIn8 = (const uint8_t *) pBufferIn;
+        pOut8 = (uint8_t *) pBufferOut;
+        if (numBytesLeftOver) {
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+                /* copy the remaining bytes into temporary buffer and XOR with
+                 * the 64-bytes of keystream. Then copy on the valid bytes back
+                 * to the output buffer */
+
+                memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver],
+                       numBytesLeftOver);
+                pKeyStream64 = (uint64_t *) &keyStream[0];
+                pTemp64 = (uint64_t *) &tempSrc[0];
+                pdstTemp64 = (uint64_t *) &tempDst[0];
+
+                asm_XorKeyStream64B_sse(pTemp64, pdstTemp64, pKeyStream64);
+                memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0],
+                       numBytesLeftOver);
+
+        }
+#ifdef SAFE_DATA
+        /* Clear sensitive data in stack */
+        clear_mem(keyStream, sizeof(keyStream));
+        clear_mem(&zucState, sizeof(zucState));
+#endif
+}
+
+static inline
+void _zuc_eea3_4_buffer_sse(const void * const pKey[4],
+                            const void * const pIv[4],
+                            const void * const pBufferIn[4],
+                            void *pBufferOut[4],
+                            const uint32_t length[4])
+{
+
+        DECLARE_ALIGNED(ZucState4_t state, 64);
+        DECLARE_ALIGNED(ZucState_t singlePktState, 64);
+
+        unsigned int i = 0;
+        /* Calculate the minimum input packet size */
+        uint32_t bytes1 = (length[0] < length[1] ?
+                           length[0] : length[1]);
+        uint32_t bytes2 = (length[2] < length[3] ?
+                           length[2] : length[3]);
+        /* min number of bytes */
+        uint32_t bytes = (bytes1 < bytes2) ?
bytes1 : bytes2; + uint32_t numKeyStreamsPerPkt = bytes/ZUC_KEYSTR_LEN; + uint32_t remainBytes[4] = {0}; + DECLARE_ALIGNED(uint8_t keyStr1[64], 64); + DECLARE_ALIGNED(uint8_t keyStr2[64], 64); + DECLARE_ALIGNED(uint8_t keyStr3[64], 64); + DECLARE_ALIGNED(uint8_t keyStr4[64], 64); + DECLARE_ALIGNED(uint8_t tempSrc[64], 64); + DECLARE_ALIGNED(uint8_t tempDst[64], 64); + /* structure to store the 4 keys */ + DECLARE_ALIGNED(ZucKey4_t keys, 64); + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(ZucIv4_t ivs, 64); + uint32_t numBytesLeftOver = 0; + const uint8_t *pTempBufInPtr = NULL; + uint8_t *pTempBufOutPtr = NULL; + + const uint64_t *pIn64_0 = NULL; + const uint64_t *pIn64_1 = NULL; + const uint64_t *pIn64_2 = NULL; + const uint64_t *pIn64_3 = NULL; + uint64_t *pOut64_0 = NULL; + uint64_t *pOut64_1 = NULL; + uint64_t *pOut64_2 = NULL; + uint64_t *pOut64_3 = NULL; + uint64_t *pTempSrc64 = NULL; + uint64_t *pTempDst64 = NULL; + uint64_t *pKeyStream64 = NULL; + + /* rounded down minimum length */ + bytes = numKeyStreamsPerPkt * ZUC_KEYSTR_LEN; + + /* Need to set the LFSR state to zero */ + memset(&state, 0, sizeof(ZucState4_t)); + + /* Calculate the number of bytes left over for each packet */ + for (i=0; i< 4; i++) + remainBytes[i] = length[i] - bytes; + + /* Setup the Keys */ + keys.pKey1 = pKey[0]; + keys.pKey2 = pKey[1]; + keys.pKey3 = pKey[2]; + keys.pKey4 = pKey[3]; + + /* setup the IV's */ + ivs.pIv1 = pIv[0]; + ivs.pIv2 = pIv[1]; + ivs.pIv3 = pIv[2]; + ivs.pIv4 = pIv[3]; + + asm_ZucInitialization_4_sse( &keys, &ivs, &state); + + pOut64_0 = (uint64_t *) pBufferOut[0]; + pOut64_1 = (uint64_t *) pBufferOut[1]; + pOut64_2 = (uint64_t *) pBufferOut[2]; + pOut64_3 = (uint64_t *) pBufferOut[3]; + + pIn64_0 = (const uint64_t *) pBufferIn[0]; + pIn64_1 = (const uint64_t *) pBufferIn[1]; + pIn64_2 = (const uint64_t *) pBufferIn[2]; + pIn64_3 = (const uint64_t *) pBufferIn[3]; + + /* Loop for 64 bytes at a time generating 4 key-streams per loop */ + while (numKeyStreamsPerPkt) { + /* Generate 64 bytes at a time */ + asm_ZucGenKeystream64B_4_sse(&state, + (uint32_t *) keyStr1, + (uint32_t *) keyStr2, + (uint32_t *) keyStr3, + (uint32_t *) keyStr4); + + /* XOR the KeyStream with the input buffers and store in output + * buffer*/ + pKeyStream64 = (uint64_t *) keyStr1; + asm_XorKeyStream64B_sse(pIn64_0, pOut64_0, pKeyStream64); + pIn64_0 += 8; + pOut64_0 += 8; + + pKeyStream64 = (uint64_t *) keyStr2; + asm_XorKeyStream64B_sse(pIn64_1, pOut64_1, pKeyStream64); + pIn64_1 += 8; + pOut64_1 += 8; + + pKeyStream64 = (uint64_t *) keyStr3; + asm_XorKeyStream64B_sse(pIn64_2, pOut64_2, pKeyStream64); + pIn64_2 += 8; + pOut64_2 += 8; + + pKeyStream64 = (uint64_t *) keyStr4; + asm_XorKeyStream64B_sse(pIn64_3, pOut64_3, pKeyStream64); + pIn64_3 += 8; + pOut64_3 += 8; + + /* Update keystream count */ + numKeyStreamsPerPkt--; + + } + + /* process each packet separately for the remaining bytes */ + for (i = 0; i < 4; i++) { + if (remainBytes[i]) { + /* need to copy the zuc state to single packet state */ + singlePktState.lfsrState[0] = state.lfsrState[0][i]; + singlePktState.lfsrState[1] = state.lfsrState[1][i]; + singlePktState.lfsrState[2] = state.lfsrState[2][i]; + singlePktState.lfsrState[3] = state.lfsrState[3][i]; + singlePktState.lfsrState[4] = state.lfsrState[4][i]; + singlePktState.lfsrState[5] = state.lfsrState[5][i]; + singlePktState.lfsrState[6] = state.lfsrState[6][i]; + singlePktState.lfsrState[7] = state.lfsrState[7][i]; + singlePktState.lfsrState[8] = state.lfsrState[8][i]; + 
singlePktState.lfsrState[9] = state.lfsrState[9][i]; + singlePktState.lfsrState[10] = state.lfsrState[10][i]; + singlePktState.lfsrState[11] = state.lfsrState[11][i]; + singlePktState.lfsrState[12] = state.lfsrState[12][i]; + singlePktState.lfsrState[13] = state.lfsrState[13][i]; + singlePktState.lfsrState[14] = state.lfsrState[14][i]; + singlePktState.lfsrState[15] = state.lfsrState[15][i]; + + singlePktState.fR1 = state.fR1[i]; + singlePktState.fR2 = state.fR2[i]; + + singlePktState.bX0 = state.bX0[i]; + singlePktState.bX1 = state.bX1[i]; + singlePktState.bX2 = state.bX2[i]; + singlePktState.bX3 = state.bX3[i]; + + numKeyStreamsPerPkt = remainBytes[i] / ZUC_KEYSTR_LEN; + numBytesLeftOver = remainBytes[i] % ZUC_KEYSTR_LEN; + + pTempBufInPtr = pBufferIn[i]; + pTempBufOutPtr = pBufferOut[i]; + + /* update the output and input pointers here to point + * to the i'th buffers */ + pOut64_0 = (uint64_t *) &pTempBufOutPtr[length[i] - + remainBytes[i]]; + pIn64_0 = (const uint64_t *) &pTempBufInPtr[length[i] - + remainBytes[i]]; + + while (numKeyStreamsPerPkt--) { + /* Generate the key stream 64 bytes at a time */ + asm_ZucGenKeystream64B((uint32_t *) keyStr1, + &singlePktState); + pKeyStream64 = (uint64_t *) keyStr1; + asm_XorKeyStream64B_sse(pIn64_0, pOut64_0, + pKeyStream64); + pIn64_0 += 8; + pOut64_0 += 8; + } + + + /* Check for remaining 0 to 63 bytes */ + if (numBytesLeftOver) { + asm_ZucGenKeystream64B((uint32_t *) &keyStr1, + &singlePktState); + uint32_t offset = length[i] - numBytesLeftOver; + + /* copy the remaining bytes into temporary + * buffer and XOR with the 64-bytes of + * keystream. Then copy on the valid bytes back + * to the output buffer */ + memcpy(&tempSrc[0], &pTempBufInPtr[offset], + numBytesLeftOver); + memset(&tempSrc[numBytesLeftOver], 0, + 64 - numBytesLeftOver); + + pKeyStream64 = (uint64_t *) &keyStr1[0]; + pTempSrc64 = (uint64_t *) &tempSrc[0]; + pTempDst64 = (uint64_t *) &tempDst[0]; + asm_XorKeyStream64B_sse(pTempSrc64, pTempDst64, + pKeyStream64); + + memcpy(&pTempBufOutPtr[offset], + &tempDst[0], numBytesLeftOver); + } + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(keyStr1, sizeof(keyStr1)); + clear_mem(keyStr2, sizeof(keyStr2)); + clear_mem(keyStr3, sizeof(keyStr3)); + clear_mem(keyStr4, sizeof(keyStr4)); + clear_mem(&singlePktState, sizeof(singlePktState)); + clear_mem(&state, sizeof(state)); + clear_mem(&keys, sizeof(keys)); + clear_mem(&ivs, sizeof(ivs)); +#endif +} + +void zuc_eea3_1_buffer_sse(const void *pKey, + const void *pIv, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (length < ZUC_MIN_LEN || length > ZUC_MAX_LEN) + return; +#endif + + _zuc_eea3_1_buffer_sse(pKey, pIv, pBufferIn, pBufferOut, length); + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void zuc_eea3_4_buffer_sse(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t length[4]) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + unsigned int i; + + /* Check for NULL 
pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + pBufferOut == NULL || length == NULL) + return; + + for (i = 0; i < 4; i++) { + if (pKey[i] == NULL || pIv[i] == NULL || + pBufferIn[i] == NULL || pBufferOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN) + return; + } +#endif + + _zuc_eea3_4_buffer_sse(pKey, pIv, pBufferIn, pBufferOut, length); + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + + +void zuc_eea3_n_buffer_sse(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + + unsigned int i; + unsigned int packetCount = numBuffers; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + pBufferOut == NULL || length == NULL) + return; + + for (i = 0; i < numBuffers; i++) { + if (pKey[i] == NULL || pIv[i] == NULL || + pBufferIn[i] == NULL || pBufferOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN) + return; + } +#endif + i = 0; + + while(packetCount >= 4) { + packetCount -=4; + _zuc_eea3_4_buffer_sse(&pKey[i], + &pIv[i], + &pBufferIn[i], + &pBufferOut[i], + &length[i]); + i+=4; + } + + while(packetCount--) { + _zuc_eea3_1_buffer_sse(pKey[i], + pIv[i], + pBufferIn[i], + pBufferOut[i], + length[i]); + i++; + } + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +static inline uint64_t rotate_left(uint64_t u, size_t r) +{ + return (((u) << (r)) | ((u) >> (64 - (r)))); +} + +static inline uint64_t load_uint64(const void *ptr) +{ + return *((const uint64_t *)ptr); +} + +void zuc_eia3_1_buffer_sse(const void *pKey, + const void *pIv, + const void *pBufferIn, + const uint32_t lengthInBits, + uint32_t *pMacI) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + DECLARE_ALIGNED(ZucState_t zucState, 64); + DECLARE_ALIGNED(uint32_t keyStream[16 * 2], 64); + const uint32_t keyStreamLengthInBits = ZUC_KEYSTR_LEN * 8; + /* generate a key-stream 2 words longer than the input message */ + const uint32_t N = lengthInBits + (2 * ZUC_WORD); + uint32_t L = (N + 31) / ZUC_WORD; + uint32_t *pZuc = (uint32_t *) &keyStream[0]; + uint32_t remainingBits = lengthInBits; + uint32_t T = 0; + const uint8_t *pIn8 = (const uint8_t *) pBufferIn; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBits < ZUC_MIN_LEN || lengthInBits > ZUC_MAX_LEN) + return; +#endif + + memset(&zucState, 0, sizeof(ZucState_t)); + + asm_ZucInitialization(pKey, pIv, &(zucState)); + asm_ZucGenKeystream64B(pZuc, &zucState); + + /* loop over the message bits */ + while (remainingBits >= keyStreamLengthInBits) { + remainingBits -= keyStreamLengthInBits; + L -= (keyStreamLengthInBits / 32); + + /* Generate the next key stream 8 bytes or 64 bytes */ + if (!remainingBits) + asm_ZucGenKeystream8B(&keyStream[16], &zucState); + else + 
asm_ZucGenKeystream64B(&keyStream[16], &zucState); + T = asm_Eia3Round64BSSE(T, &keyStream[0], pIn8); + memcpy(&keyStream[0], &keyStream[16], 16 * sizeof(uint32_t)); + pIn8 = &pIn8[ZUC_KEYSTR_LEN]; + } + + /* + * If remaining bits has more than 14 ZUC WORDS (double words), + * keystream needs to have up to another 2 ZUC WORDS (8B) + */ + if (remainingBits > (14 * 32)) + asm_ZucGenKeystream8B(&keyStream[16], &zucState); + T ^= asm_Eia3RemainderSSE(&keyStream[0], pIn8, remainingBits); + T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); + + /* save the final MAC-I result */ + uint32_t keyBlock = keyStream[L - 1]; + *pMacI = bswap4(T ^ keyBlock); + +#ifdef SAFE_DATA + /* Clear sensitive data (in registers and stack) */ + clear_mem(keyStream, sizeof(keyStream)); + clear_mem(&zucState, sizeof(zucState)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} -- cgit v1.2.3
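For context, a minimal usage sketch of the ZUC-EEA3 entry points added by this patch. It only illustrates how the 4-lane SSE path is driven; the all-zero 16-byte keys/IVs, the 128-byte buffers, and the assumption that "intel-ipsec-mb.h" exposes the _sse-suffixed prototype are illustrative, with the signature taken from the zuc_eea3_n_buffer_sse definition above.

#include <stdint.h>
#include "intel-ipsec-mb.h"   /* assumed to declare zuc_eea3_n_buffer_sse */

int main(void)
{
        uint8_t key[4][16] = {{0}};          /* illustrative 128-bit keys */
        uint8_t iv[4][16] = {{0}};           /* illustrative 128-bit IVs */
        static uint8_t in[4][128], out[4][128];
        const void *keys[4], *ivs[4], *src[4];
        void *dst[4];
        uint32_t len[4];
        unsigned int i;

        for (i = 0; i < 4; i++) {
                keys[i] = key[i];
                ivs[i] = iv[i];
                src[i] = in[i];
                dst[i] = out[i];
                len[i] = sizeof(in[i]);      /* per-buffer length in bytes */
        }

        /* Four buffers are serviced by the 4-lane SSE keystream path;
         * a count that is not a multiple of 4 falls back to the
         * single-buffer path for the remainder, as in
         * zuc_eea3_n_buffer_sse() above. */
        zuc_eea3_n_buffer_sse(keys, ivs, src, dst, len, 4);
        return 0;
}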