summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/sse
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse')
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm528
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm331
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm585
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm346
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm630
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm360
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm366
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm345
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm364
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm162
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm298
-rw-r--r--src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/gcm_sse.asm2166
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm210
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm179
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm225
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm250
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm260
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm254
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm274
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm310
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm28
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm28
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm268
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm274
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm347
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm341
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm249
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm325
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm335
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm312
-rw-r--r--src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c628
-rw-r--r--src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm777
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm425
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm482
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm507
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm41
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm604
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm517
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm41
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm495
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm439
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm447
54 files changed, 16720 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm
new file mode 100644
index 00000000..3226adee
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm
@@ -0,0 +1,528 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void *in,
+; UINT128 *IV,
+; UINT128 keys[11],
+; void *out,
+; UINT64 len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+%include "os.asm"
+
+%define MOVDQ movdqu
+
+%ifdef LINUX
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%else
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%endif
+
+%define IDX rax
+%define TMP IDX
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+%define XKEY8 xmm8
+%define XKEY10 xmm9
+%define XIV xmm10
+%define XSAVED0 xmm11
+%define XSAVED1 xmm12
+%define XSAVED2 xmm13
+%define XSAVED3 xmm14
+%define XKEY xmm15
+
+%define IV_TMP XSAVED3
+
+section .text
+
+MKGLOBAL(aes_cbc_dec_128_sse,function,internal)
+aes_cbc_dec_128_sse:
+%ifndef LINUX
+ mov LEN, [rsp + 8*5]
+%endif
+
+ mov TMP, LEN
+ and TMP, 3*16
+ jz initial_4
+ cmp TMP, 2*16
+ jb initial_1
+ ja initial_3
+
+initial_2:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XIV, XDATA1
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+ aesdec XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+ aesdec XDATA1, [KEYS + 3*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+ aesdec XDATA1, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+
+ movdqa XKEY8, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+ aesdec XDATA1, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY8 ; 8. DEC
+ aesdec XDATA1, XKEY8
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+ aesdec XDATA1, [KEYS + 9*16]
+
+ aesdeclast XDATA0, XKEY10 ; 10. DEC
+ aesdeclast XDATA1, XKEY10
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_1:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XIV, XDATA0
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+
+ mov IDX, 1*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+
+ movdqa XKEY8, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+
+ aesdec XDATA0, XKEY8 ; 8. DEC
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+
+ aesdeclast XDATA0, XKEY10 ; 10. DEC
+
+ pxor XDATA0, IV_TMP
+
+ movdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je done
+ jmp main_loop
+
+
+initial_3:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XIV, XDATA2
+
+ movdqa XKEY, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY ; 1. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+
+ movdqa XKEY, [KEYS + 3*16]
+
+ mov IDX, 3*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY ; 3. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+
+ movdqa XKEY, [KEYS + 5*16]
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY ; 5. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+
+ movdqa XKEY, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+
+ movdqa XKEY8, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY ; 7. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+
+ movdqa XKEY, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY8 ; 8. DEC
+ aesdec XDATA1, XKEY8
+ aesdec XDATA2, XKEY8
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY ; 9. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+
+ aesdeclast XDATA0, XKEY10 ; 10. DEC
+ aesdeclast XDATA1, XKEY10
+ aesdeclast XDATA2, XKEY10
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_4:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+ movdqu XDATA3, [IN + 3*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XIV, XDATA3
+
+ movdqa XKEY, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY ; 1. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY ; 3. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 5*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY ; 5. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ movdqa XKEY8, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY ; 7. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY8 ; 8. DEC
+ aesdec XDATA1, XKEY8
+ aesdec XDATA2, XKEY8
+ aesdec XDATA3, XKEY8
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY ; 9. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ aesdeclast XDATA0, XKEY10 ; 10. DEC
+ aesdeclast XDATA1, XKEY10
+ aesdeclast XDATA2, XKEY10
+ aesdeclast XDATA3, XKEY10
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+ movdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz done
+ jmp main_loop
+
+ align 16
+main_loop:
+ ; load cipher text
+ movdqu XDATA0, [IN + IDX + 0*16]
+ movdqu XDATA1, [IN + IDX + 1*16]
+ movdqu XDATA2, [IN + IDX + 2*16]
+ movdqu XDATA3, [IN + IDX + 3*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XSAVED3, XDATA3
+
+ movdqa XKEY, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ add IDX, 4*16
+
+ aesdec XDATA0, XKEY ; 1. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 3*16]
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ aesdec XDATA0, XKEY ; 3. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ aesdec XDATA0, XKEY ; 5. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ aesdec XDATA0, XKEY ; 7. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ movdqa XKEY, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY8 ; 8. DEC
+ aesdec XDATA1, XKEY8
+ aesdec XDATA2, XKEY8
+ aesdec XDATA3, XKEY8
+
+ aesdec XDATA0, XKEY ; 9. DEC
+ aesdec XDATA1, XKEY
+ aesdec XDATA2, XKEY
+ aesdec XDATA3, XKEY
+
+ aesdeclast XDATA0, XKEY10 ; 10. DEC
+ aesdeclast XDATA1, XKEY10
+ aesdeclast XDATA2, XKEY10
+ aesdeclast XDATA3, XKEY10
+
+ pxor XDATA0, XIV
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ movdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ movdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ movdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ movdqa XIV, XSAVED3
+
+ CMP IDX, LEN
+ jne main_loop
+
+done:
+; Don't write back IV
+; movdqu [IV], XIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm
new file mode 100644
index 00000000..46c6cb3a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to compute CBC-MAC based on 128 bit CBC AES encryptionk code
+
+%define CBC_MAC
+%include "aes_cbc_enc_128_x4.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
new file mode 100644
index 00000000..3df1f74f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
@@ -0,0 +1,331 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "memcpy.asm"
+
+; routine to do AES128 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xbyteswap xmm9
+%define xkey0 xmm10
+%define xkey3 xmm11
+%define xkey6 xmm12
+%define xkey9 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define p_ivlen qword [rsp + 8*6]
+%endif
+
+%define p_tmp rsp + _buffer
+%define tmp r11
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ movdqa xkey0, [p_keys + 0*16]
+%endif
+
+ movdqa xdata0, xcounter
+ pshufb xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ movdqa CONCAT(xdata,i), xcounter
+ paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+ pshufb CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 1*16]
+
+ pxor xdata0, xkey0
+ paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
+%assign i 1
+%rep (%%by - 1)
+ pxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey3, [p_keys + 3*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+ movdqa xkeyB, [p_keys + 4*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey3 ; key 3
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey6 ; key 6
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 8*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey9, [p_keys + 9*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 8
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey9 ; key 9
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ aesenclast CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+ pxor CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+%endif
+
+%assign i 0
+%rep %%by
+ MOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+struc STACK
+_buffer: resq 2
+_rsp_save: resq 1
+endstruc
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+align 32
+MKGLOBAL(aes_cntr_128_sse,function,internal)
+aes_cntr_128_sse:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; arg5
+%endif
+
+ movdqa xbyteswap, [rel byteswap_const]
+ test p_ivlen, 16
+ jnz iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ pinsrq xcounter, [p_IV], 0
+ pinsrd xcounter, [p_IV + 8], 2
+ pinsrd xcounter, DWORD(tmp), 3
+bswap_iv:
+ pshufb xcounter, xbyteswap
+
+ mov tmp, num_bytes
+ and tmp, 3*16
+ jz chk ; x4 > or < 15 (not 3 lines)
+
+ ; 1 <= tmp <= 3
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1 ; 1 block
+ add p_out, 1*16
+ jmp chk
+
+eq2:
+ do_aes_load 2 ; 2 blocks
+ add p_out, 2*16
+ jmp chk
+
+eq3:
+ do_aes_load 3 ; 3 blocks
+ add p_out, 3*16
+ ; fall through to chk
+chk:
+ and num_bytes, ~(3*16)
+ jz do_return2
+ cmp num_bytes, 16
+ jb last
+
+ ; process multiples of 4 blocks
+ movdqa xkey0, [p_keys + 0*16]
+ movdqa xkey3, [p_keys + 3*16]
+ movdqa xkey6, [p_keys + 6*16]
+ movdqa xkey9, [p_keys + 9*16]
+ jmp main_loop2
+
+align 32
+main_loop2:
+ ; num_bytes is a multiple of 4 blocks + partial bytes
+ do_aes_noload 4
+ add p_out, 4*16
+ sub num_bytes, 4*16
+ cmp num_bytes, 4*16
+ jae main_loop2
+
+ test num_bytes, 15 ; partial bytes to be processed?
+ jnz last
+
+do_return2:
+ ; don't return updated IV
+; pshufb xcounter, xbyteswap
+; movdqu [p_IV], xcounter
+ ret
+
+last:
+ ;; Code dealing with the partial block cases
+ ; reserve 16 byte aligned buffer on the stack
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+ mov [rsp + _rsp_save], rax ; save SP
+
+ ; copy input bytes into scratch buffer
+ memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
+ ; Encryption of a single partial block (p_tmp)
+ pshufb xcounter, xbyteswap
+ movdqa xdata0, xcounter
+ pxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 9
+ aesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ aesenclast xdata0, [p_keys + 16*i]
+ ; xor keystream with the message (scratch)
+ pxor xdata0, [p_tmp]
+ movdqa [p_tmp], xdata0
+ ; copy result into the output buffer
+ memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
+ ; remove the stack frame
+ mov rsp, [rsp + _rsp_save] ; original SP
+ jmp do_return2
+
+iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ movdqu xcounter, [p_IV]
+ jmp bswap_iv
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm
new file mode 100644
index 00000000..dcc24b7c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm
@@ -0,0 +1,585 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void *in,
+; UINT128 *IV,
+; UINT128 keys[13], // +1 over key length
+; void *out,
+; UINT64 len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+%include "os.asm"
+
+%define MOVDQ movdqu
+
+%ifdef LINUX
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%else
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%endif
+
+%define IDX rax
+%define TMP IDX
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+%define XKEY10 xmm8
+%define XIV xmm9
+%define XSAVED0 xmm10
+%define XSAVED1 xmm11
+%define XSAVED2 xmm12
+%define XSAVED3 xmm13
+%define XKEY_A xmm14
+%define XKEY_B xmm15
+
+%define IV_TMP XSAVED3
+
+section .text
+
+MKGLOBAL(aes_cbc_dec_192_sse,function,internal)
+aes_cbc_dec_192_sse:
+%ifndef LINUX
+ mov LEN, [rsp + 8*5]
+%endif
+
+ mov TMP, LEN
+ and TMP, 3*16
+ jz initial_4
+ cmp TMP, 2*16
+ jb initial_1
+ ja initial_3
+
+initial_2:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XIV, XDATA1
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+ aesdec XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+ aesdec XDATA1, [KEYS + 3*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+ aesdec XDATA1, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+ aesdec XDATA1, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+ aesdec XDATA1, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+
+ aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
+ aesdec XDATA1, [KEYS + 11*16]
+
+ aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC
+ aesdeclast XDATA1, [KEYS + 12*16]
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_1:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XIV, XDATA0
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+
+ mov IDX, 1*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+
+ aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
+
+ aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC
+
+ pxor XDATA0, IV_TMP
+
+ movdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je done
+ jmp main_loop
+
+
+initial_3:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XIV, XDATA2
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+ mov IDX, 3*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ aesdeclast XDATA0, XKEY_B ; 12. DEC
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+
+
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_4:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+ movdqu XDATA3, [IN + 3*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XIV, XDATA3
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+ aesdec XDATA3, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+
+
+ aesdeclast XDATA0, XKEY_B ; 12. DEC
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+ aesdeclast XDATA3, XKEY_B
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+ movdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz done
+ jmp main_loop
+
+ align 16
+main_loop:
+ ; load cipher text
+ movdqu XDATA0, [IN + IDX + 0*16]
+ movdqu XDATA1, [IN + IDX + 1*16]
+ movdqu XDATA2, [IN + IDX + 2*16]
+ movdqu XDATA3, [IN + IDX + 3*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XSAVED3, XDATA3
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ add IDX, 4*16
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+ aesdec XDATA3, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ aesdeclast XDATA0, XKEY_B ; 12. DECLAST
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+ aesdeclast XDATA3, XKEY_B
+
+ pxor XDATA0, XIV
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ movdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ movdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ movdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ movdqa XIV, XSAVED3
+
+ CMP IDX, LEN
+ jne main_loop
+
+done:
+; Don't write back IV
+; movdqu [IV], XIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm
new file mode 100644
index 00000000..786604df
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm
@@ -0,0 +1,346 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "memcpy.asm"
+
+; routine to do AES192 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xbyteswap xmm9
+%define xkey0 xmm10
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define p_ivlen qword [rsp + 8*6]
+%endif
+
+%define tmp r11
+%define p_tmp rsp + _buffer
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ movdqa xkey0, [p_keys + 0*16]
+%endif
+
+ movdqa xdata0, xcounter
+ pshufb xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ movdqa CONCAT(xdata,i), xcounter
+ paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+ pshufb CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 1*16]
+
+ pxor xdata0, xkey0
+ paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
+%assign i 1
+%rep (%%by - 1)
+ pxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ movdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ aesenclast CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+ pxor CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+%endif
+
+%assign i 0
+%rep %%by
+ MOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+struc STACK
+_buffer: resq 2
+_rsp_save: resq 1
+endstruc
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; aes_cntr_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+align 32
+MKGLOBAL(aes_cntr_192_sse,function,internal)
+aes_cntr_192_sse:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ movdqa xbyteswap, [rel byteswap_const]
+ test p_ivlen, 16
+ jnz iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ pinsrq xcounter, [p_IV], 0
+ pinsrd xcounter, [p_IV + 8], 2
+ pinsrd xcounter, DWORD(tmp), 3
+bswap_iv:
+ pshufb xcounter, xbyteswap
+
+ mov tmp, num_bytes
+ and tmp, 3*16
+ jz chk ; x4 > or < 15 (not 3 lines)
+
+ ; 1 <= tmp <= 3
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ jmp chk
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ jmp chk
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ ; fall through to chk
+chk:
+ and num_bytes, ~(3*16)
+ jz do_return2
+ cmp num_bytes, 16
+ jb last
+
+ ; process multiples of 4 blocks
+ movdqa xkey0, [p_keys + 0*16]
+ movdqa xkey4, [p_keys + 4*16]
+ movdqa xkey8, [p_keys + 8*16]
+ movdqa xkey12, [p_keys + 12*16]
+ jmp main_loop2
+
+align 32
+main_loop2:
+ ; num_bytes is a multiple of 4 and >0
+ do_aes_noload 4
+ add p_out, 4*16
+ sub num_bytes, 4*16
+ cmp num_bytes, 4*16
+ jae main_loop2
+
+ test num_bytes, 15 ; partial bytes to be processed?
+ jnz last
+
+do_return2:
+ ; don't return updated IV
+; pshufb xcounter, xbyteswap
+; movdqu [p_IV], xcounter
+ ret
+
+last:
+ ;; Code dealing with the partial block cases
+ ; reserve 16 byte aligned buffer on stack
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+ mov [rsp + _rsp_save], rax ; save SP
+
+ ; copy input bytes into scratch buffer
+ memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
+ ; Encryption of a single partial block (p_tmp)
+ pshufb xcounter, xbyteswap
+ movdqa xdata0, xcounter
+ pxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 11
+ aesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ aesenclast xdata0, [p_keys + 16*i]
+ ; xor keystream with the message (scratch)
+ pxor xdata0, [p_tmp]
+ movdqa [p_tmp], xdata0
+ ; copy result into the output buffer
+ memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
+ ; remove the stack frame
+ mov rsp, [rsp + _rsp_save] ; original SP
+ jmp do_return2
+
+iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ movdqu xcounter, [p_IV]
+ jmp bswap_iv
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm
new file mode 100644
index 00000000..c9b4bd3a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm
@@ -0,0 +1,630 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void *in,
+; UINT128 *IV,
+; UINT128 keys[15],
+; void *out,
+; UINT64 len_bytes);
+;
+; arg 1: rcx: pointer to input (cipher text)
+; arg 2: rdx: pointer to IV
+; arg 3: r8: pointer to keys
+; arg 4: r9: pointer to output (plain text)
+; arg 5: sp: length in bytes (multiple of 16)
+;
+
+%include "os.asm"
+
+%define MOVDQ movdqu
+
+%ifdef LINUX
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%else
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%endif
+
+%define IDX rax
+%define TMP IDX
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+%define XKEY10 xmm8
+%define XIV xmm9
+%define XSAVED0 xmm10
+%define XSAVED1 xmm11
+%define XSAVED2 xmm12
+%define XSAVED3 xmm13
+%define XKEY_A xmm14
+%define XKEY_B xmm15
+
+%define IV_TMP XSAVED3
+
+section .text
+
+MKGLOBAL(aes_cbc_dec_256_sse,function,internal)
+aes_cbc_dec_256_sse:
+%ifndef LINUX
+ mov LEN, [rsp + 8*5]
+%endif
+
+ mov TMP, LEN
+ and TMP, 3*16
+ jz initial_4
+ cmp TMP, 2*16
+ jb initial_1
+ ja initial_3
+
+initial_2:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XIV, XDATA1
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+ aesdec XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+ aesdec XDATA1, [KEYS + 3*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+ aesdec XDATA1, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+ aesdec XDATA1, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+ aesdec XDATA1, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+
+ aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
+ aesdec XDATA1, [KEYS + 11*16]
+
+ aesdec XDATA0, [KEYS + 12*16] ; 12. DEC
+ aesdec XDATA1, [KEYS + 12*16]
+
+ aesdec XDATA0, [KEYS + 13*16] ; 13. DEC
+ aesdec XDATA1, [KEYS + 13*16]
+
+ aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC
+ aesdeclast XDATA1, [KEYS + 14*16]
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_1:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XIV, XDATA0
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, [KEYS + 1*16] ; 1. DEC
+
+ mov IDX, 1*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, [KEYS + 3*16] ; 3. DEC
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, [KEYS + 5*16] ; 5. DEC
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, [KEYS + 7*16] ; 7. DEC
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, [KEYS + 9*16] ; 9. DEC
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+
+ aesdec XDATA0, [KEYS + 11*16] ; 11. DEC
+
+ aesdec XDATA0, [KEYS + 12*16] ; 12. DEC
+
+ aesdec XDATA0, [KEYS + 13*16] ; 13. DEC
+
+ aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC
+
+ pxor XDATA0, IV_TMP
+
+ movdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je done
+ jmp main_loop
+
+
+initial_3:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XIV, XDATA2
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+ mov IDX, 3*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ aesdec XDATA0, XKEY_B ; 12. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ aesdec XDATA0, XKEY_A ; 13. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+
+ aesdeclast XDATA0, XKEY_B ; 14. DEC
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je done
+ jmp main_loop
+
+
+ align 16
+initial_4:
+ ; load cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+ movdqu XDATA3, [IN + 3*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XIV, XDATA3
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ movdqu IV_TMP, [IV]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+ aesdec XDATA3, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ aesdec XDATA0, XKEY_B ; 12. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ aesdec XDATA0, XKEY_A ; 13. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ aesdeclast XDATA0, XKEY_B ; 14. DEC
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+ aesdeclast XDATA3, XKEY_B
+
+ pxor XDATA0, IV_TMP
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+ movdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz done
+ jmp main_loop
+
+ align 16
+main_loop:
+ ; load cipher text
+ movdqu XDATA0, [IN + IDX + 0*16]
+ movdqu XDATA1, [IN + IDX + 1*16]
+ movdqu XDATA2, [IN + IDX + 2*16]
+ movdqu XDATA3, [IN + IDX + 3*16]
+
+ ; save cipher text
+ movdqa XSAVED0, XDATA0
+ movdqa XSAVED1, XDATA1
+ movdqa XSAVED2, XDATA2
+ movdqa XSAVED3, XDATA3
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ add IDX, 4*16
+
+ aesdec XDATA0, XKEY_A ; 1. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ aesdec XDATA0, XKEY2 ; 2. DEC
+ aesdec XDATA1, XKEY2
+ aesdec XDATA2, XKEY2
+ aesdec XDATA3, XKEY2
+
+ aesdec XDATA0, XKEY_A ; 3. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ aesdec XDATA0, XKEY4 ; 4. DEC
+ aesdec XDATA1, XKEY4
+ aesdec XDATA2, XKEY4
+ aesdec XDATA3, XKEY4
+
+ aesdec XDATA0, XKEY_A ; 5. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ aesdec XDATA0, XKEY6 ; 6. DEC
+ aesdec XDATA1, XKEY6
+ aesdec XDATA2, XKEY6
+ aesdec XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ aesdec XDATA0, XKEY_A ; 7. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ aesdec XDATA0, XKEY_B ; 8. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ aesdec XDATA0, XKEY_A ; 9. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ aesdec XDATA0, XKEY10 ; 10. DEC
+ aesdec XDATA1, XKEY10
+ aesdec XDATA2, XKEY10
+ aesdec XDATA3, XKEY10
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ aesdec XDATA0, XKEY_A ; 11. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ aesdec XDATA0, XKEY_B ; 12. DEC
+ aesdec XDATA1, XKEY_B
+ aesdec XDATA2, XKEY_B
+ aesdec XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ aesdec XDATA0, XKEY_A ; 13. DEC
+ aesdec XDATA1, XKEY_A
+ aesdec XDATA2, XKEY_A
+ aesdec XDATA3, XKEY_A
+
+ aesdeclast XDATA0, XKEY_B ; 14. DEC
+ aesdeclast XDATA1, XKEY_B
+ aesdeclast XDATA2, XKEY_B
+ aesdeclast XDATA3, XKEY_B
+
+ pxor XDATA0, XIV
+ pxor XDATA1, XSAVED0
+ pxor XDATA2, XSAVED1
+ pxor XDATA3, XSAVED2
+
+ movdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ movdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ movdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ movdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ movdqa XIV, XSAVED3
+
+ CMP IDX, LEN
+ jne main_loop
+
+done:
+; Don't write back IV
+; movdqu [IV], XIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
new file mode 100644
index 00000000..114f20bd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm
@@ -0,0 +1,360 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "memcpy.asm"
+
+; routine to do AES256 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xbyteswap xmm9
+%define xkey0 xmm10
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define p_ivlen qword [rsp + 8*6]
+%endif
+
+%define tmp r11
+%define p_tmp rsp + _buffer
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ movdqa xkey0, [p_keys + 0*16]
+%endif
+
+ movdqa xdata0, xcounter
+ pshufb xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ movdqa CONCAT(xdata,i), xcounter
+ paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+ pshufb CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 1*16]
+
+ pxor xdata0, xkey0
+ paddd xcounter, [rel CONCAT(ddq_add_,%%by)]
+%assign i 1
+%rep (%%by - 1)
+ pxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ movdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ movdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyA, [p_keys + 13*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+ movdqa xkeyB, [p_keys + 14*16]
+%assign i 0
+%rep %%by
+ aesenc CONCAT(xdata,i), xkeyA ; key 13
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ aesenclast CONCAT(xdata,i), xkeyB ; key 14
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ MOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+ pxor CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ MOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ pxor CONCAT(xdata,i), xkeyA
+%endif
+
+%assign i 0
+%rep %%by
+ MOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+struc STACK
+_buffer: resq 2
+_rsp_save: resq 1
+endstruc
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+align 32
+MKGLOBAL(aes_cntr_256_sse,function,internal)
+aes_cntr_256_sse:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ movdqa xbyteswap, [rel byteswap_const]
+ test p_ivlen, 16
+ jnz iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ pinsrq xcounter, [p_IV], 0
+ pinsrd xcounter, [p_IV + 8], 2
+ pinsrd xcounter, DWORD(tmp), 3
+bswap_iv:
+ pshufb xcounter, xbyteswap
+
+ mov tmp, num_bytes
+ and tmp, 3*16
+ jz chk ; x4 > or < 15 (not 3 lines)
+
+ ; 1 <= tmp <= 3
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ jmp chk
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ jmp chk
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ ; fall through to chk
+chk:
+ and num_bytes, ~(3*16)
+ jz do_return2
+ cmp num_bytes, 16
+ jb last
+
+ ; process multiples of 4 blocks
+ movdqa xkey0, [p_keys + 0*16]
+ movdqa xkey4, [p_keys + 4*16]
+ movdqa xkey8, [p_keys + 8*16]
+ movdqa xkey12, [p_keys + 12*16]
+ jmp main_loop2
+
+align 32
+main_loop2:
+ ; num_bytes is a multiple of 4 and >0
+ do_aes_noload 4
+ add p_out, 4*16
+ sub num_bytes, 4*16
+ cmp num_bytes, 4*16
+ jae main_loop2
+
+ test num_bytes, 15 ; partial bytes to be processed?
+ jnz last
+
+do_return2:
+ ; don't return updated IV
+; pshufb xcounter, xbyteswap
+; movdqu [p_IV], xcounter
+ ret
+
+last:
+ ;; Code dealing with the partial block cases
+ ; reserve 16 byte aligned buffer on stack
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+ mov [rsp + _rsp_save], rax ; save SP
+
+ ; copy input bytes into scratch buffer
+ memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
+ ; Encryption of a single partial block (p_tmp)
+ pshufb xcounter, xbyteswap
+ movdqa xdata0, xcounter
+ pxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 13
+ aesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ aesenclast xdata0, [p_keys + 16*i]
+ ; xor keystream with the message (scratch)
+ pxor xdata0, [p_tmp]
+ movdqa [p_tmp], xdata0
+ ; copy result into the output buffer
+ memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
+ ; remove the stack frame
+ mov rsp, [rsp + _rsp_save] ; original SP
+ jmp do_return2
+
+iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ movdqu xcounter, [p_IV]
+ jmp bswap_iv
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm
new file mode 100644
index 00000000..13c324be
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm
@@ -0,0 +1,366 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to do a 128 bit CBC AES encryption / CBC-MAC digest computation
+;;; processes 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS_x8 {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_x4(AES_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save: resq 8
+endstruc
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi ;r8
+%define arg4 rsi ;r9
+%endif
+
+%define ARG arg1
+%define LEN arg2
+
+%define IDX rax
+
+%define IN0 r8
+%define KEYS0 rbx
+
+%define IN1 r10
+%define KEYS1 arg3
+
+%define IN2 r12
+%define KEYS2 arg4
+
+%define IN3 r14
+%define KEYS3 rbp
+
+%ifndef CBC_MAC
+;; No cipher text write back for CBC-MAC
+%define OUT0 r9
+%define OUT1 r11
+%define OUT2 r13
+%define OUT3 r15
+%endif
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+
+%define XKEY0_3 xmm4
+%define XKEY0_6 [KEYS0 + 16*6]
+%define XTMP xmm5
+%define XKEY0_9 xmm6
+
+%define XKEY1_3 xmm7
+%define XKEY1_6 xmm8
+%define XKEY1_9 xmm9
+
+%define XKEY2_3 xmm10
+%define XKEY2_6 xmm11
+%define XKEY2_9 xmm12
+
+%define XKEY3_3 xmm13
+%define XKEY3_6 xmm14
+%define XKEY3_9 xmm15
+
+section .text
+
+%ifdef CBC_MAC
+MKGLOBAL(aes128_cbc_mac_x4,function,internal)
+aes128_cbc_mac_x4:
+%else
+MKGLOBAL(aes_cbc_enc_128_x4,function,internal)
+aes_cbc_enc_128_x4:
+%endif
+ sub rsp, STACK_size
+ mov [rsp + _gpr_save + 8*0], rbp
+%ifdef CBC_MAC
+ mov [rsp + _gpr_save + 8*1], rbx
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+%endif
+ mov IDX, 16
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN1, [ARG + _aesarg_in + 8*1]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN3, [ARG + _aesarg_in + 8*3]
+
+ MOVDQ XDATA0, [IN0] ; load first block of plain text
+ MOVDQ XDATA1, [IN1] ; load first block of plain text
+ MOVDQ XDATA2, [IN2] ; load first block of plain text
+ MOVDQ XDATA3, [IN3] ; load first block of plain text
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+
+ pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+%ifndef CBC_MAC
+ mov OUT0, [ARG + _aesarg_out + 8*0]
+ mov OUT1, [ARG + _aesarg_out + 8*1]
+ mov OUT2, [ARG + _aesarg_out + 8*2]
+ mov OUT3, [ARG + _aesarg_out + 8*3]
+%endif
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+ movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key
+ movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key
+ movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key
+ movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key
+ movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key
+ movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key
+ movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key
+ movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ MOVDQ [OUT0], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1], XDATA1 ; write back ciphertext
+ MOVDQ [OUT2], XDATA2 ; write back ciphertext
+ MOVDQ [OUT3], XDATA3 ; write back ciphertext
+%endif
+ cmp LEN, IDX
+ je done
+
+main_loop:
+ pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV
+ pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV
+ pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV
+ pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ ;; No cipher text write back for CBC-MAC
+ MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertext
+ MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertext
+ MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertext
+%endif
+
+ add IDX, 16
+ cmp LEN, IDX
+ jne main_loop
+
+done:
+ ;; update IV / store digest for CBC-MAC
+ movdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ movdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ movdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ movdqa [ARG + _aesarg_IV + 16*3], XDATA3
+
+ ;; update IN and OUT
+ add IN0, LEN
+ mov [ARG + _aesarg_in + 8*0], IN0
+ add IN1, LEN
+ mov [ARG + _aesarg_in + 8*1], IN1
+ add IN2, LEN
+ mov [ARG + _aesarg_in + 8*2], IN2
+ add IN3, LEN
+ mov [ARG + _aesarg_in + 8*3], IN3
+
+%ifndef CBC_MAC
+ ;; No OUT pointer updates for CBC-MAC
+ add OUT0, LEN
+ mov [ARG + _aesarg_out + 8*0], OUT0
+ add OUT1, LEN
+ mov [ARG + _aesarg_out + 8*1], OUT1
+ add OUT2, LEN
+ mov [ARG + _aesarg_out + 8*2], OUT2
+ add OUT3, LEN
+ mov [ARG + _aesarg_out + 8*3], OUT3
+%endif
+
+%ifdef CBC_MAC
+ mov rbx, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+%endif
+ mov rbp, [rsp + _gpr_save + 8*0]
+ add rsp, STACK_size
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm
new file mode 100644
index 00000000..e442d633
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm
@@ -0,0 +1,345 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS_x8 {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_x4(AES_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+
+%define IN0 r8
+%define KEYS0 rbx
+%define OUT0 r9
+
+%define IN1 r10
+%define KEYS1 REG3
+%define OUT1 r11
+
+%define IN2 r12
+%define KEYS2 REG4
+%define OUT2 r13
+
+%define IN3 r14
+%define KEYS3 rbp
+%define OUT3 r15
+
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+
+%define XKEY0_3 xmm4
+%define XKEY0_6 [KEYS0 + 16*6]
+%define XTMP xmm5
+%define XKEY0_9 xmm6
+
+%define XKEY1_3 xmm7
+%define XKEY1_6 xmm8
+%define XKEY1_9 xmm9
+
+%define XKEY2_3 xmm10
+%define XKEY2_6 xmm11
+%define XKEY2_9 xmm12
+
+%define XKEY3_3 xmm13
+%define XKEY3_6 xmm14
+%define XKEY3_9 xmm15
+
+section .text
+
+MKGLOBAL(aes_cbc_enc_192_x4,function,internal)
+aes_cbc_enc_192_x4:
+
+ push rbp
+
+ mov IDX, 16
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN1, [ARG + _aesarg_in + 8*1]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN3, [ARG + _aesarg_in + 8*3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ MOVDQ XDATA0, [IN0] ; load first block of plain text
+ MOVDQ XDATA1, [IN1] ; load first block of plain text
+ MOVDQ XDATA2, [IN2] ; load first block of plain text
+ MOVDQ XDATA3, [IN3] ; load first block of plain text
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+
+ pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+ mov OUT0, [ARG + _aesarg_out + 8*0]
+ mov OUT1, [ARG + _aesarg_out + 8*1]
+ mov OUT2, [ARG + _aesarg_out + 8*2]
+ mov OUT3, [ARG + _aesarg_out + 8*3]
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+ movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key
+ movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key
+ movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key
+ movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key
+ movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key
+ movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key
+ movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key
+ movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC
+
+ MOVDQ [OUT0], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1], XDATA1 ; write back ciphertext
+ MOVDQ [OUT2], XDATA2 ; write back ciphertext
+ MOVDQ [OUT3], XDATA3 ; write back ciphertext
+
+ cmp LEN, IDX
+ je done
+
+main_loop:
+ pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV
+ pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV
+ pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV
+ pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV
+
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC
+
+
+
+ MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex
+ MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex
+ MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex
+
+
+ add IDX, 16
+ cmp LEN, IDX
+ jne main_loop
+
+done:
+ ;; update IV
+ movdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ movdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ movdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ movdqa [ARG + _aesarg_IV + 16*3], XDATA3
+
+ ;; update IN and OUT
+ add IN0, LEN
+ mov [ARG + _aesarg_in + 8*0], IN0
+ add IN1, LEN
+ mov [ARG + _aesarg_in + 8*1], IN1
+ add IN2, LEN
+ mov [ARG + _aesarg_in + 8*2], IN2
+ add IN3, LEN
+ mov [ARG + _aesarg_in + 8*3], IN3
+
+ add OUT0, LEN
+ mov [ARG + _aesarg_out + 8*0], OUT0
+ add OUT1, LEN
+ mov [ARG + _aesarg_out + 8*1], OUT1
+ add OUT2, LEN
+ mov [ARG + _aesarg_out + 8*2], OUT2
+ add OUT3, LEN
+ mov [ARG + _aesarg_out + 8*3], OUT3
+
+ pop rbp
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm
new file mode 100644
index 00000000..937e2dbe
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm
@@ -0,0 +1,364 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In and Out pointers at end
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS_x8 {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_x4(AES_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+
+%define IN0 r8
+%define KEYS0 rbx
+%define OUT0 r9
+
+%define IN1 r10
+%define KEYS1 REG3
+%define OUT1 r11
+
+%define IN2 r12
+%define KEYS2 REG4
+%define OUT2 r13
+
+%define IN3 r14
+%define KEYS3 rbp
+%define OUT3 r15
+
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+
+%define XKEY0_3 xmm4
+%define XKEY0_6 [KEYS0 + 16*6]
+%define XTMP xmm5
+%define XKEY0_9 xmm6
+
+%define XKEY1_3 xmm7
+%define XKEY1_6 xmm8
+%define XKEY1_9 xmm9
+
+%define XKEY2_3 xmm10
+%define XKEY2_6 xmm11
+%define XKEY2_9 xmm12
+
+%define XKEY3_3 xmm13
+%define XKEY3_6 xmm14
+%define XKEY3_9 xmm15
+
+section .text
+
+MKGLOBAL(aes_cbc_enc_256_x4,function,internal)
+aes_cbc_enc_256_x4:
+
+ push rbp
+
+ mov IDX, 16
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN1, [ARG + _aesarg_in + 8*1]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN3, [ARG + _aesarg_in + 8*3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ MOVDQ XDATA0, [IN0] ; load first block of plain text
+ MOVDQ XDATA1, [IN1] ; load first block of plain text
+ MOVDQ XDATA2, [IN2] ; load first block of plain text
+ MOVDQ XDATA3, [IN3] ; load first block of plain text
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+
+ pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+
+ mov OUT0, [ARG + _aesarg_out + 8*0]
+ mov OUT1, [ARG + _aesarg_out + 8*1]
+ mov OUT2, [ARG + _aesarg_out + 8*2]
+ mov OUT3, [ARG + _aesarg_out + 8*3]
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+ movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key
+ movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key
+ movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key
+ movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key
+ movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key
+ movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key
+ movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key
+ movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+
+ MOVDQ [OUT0], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1], XDATA1 ; write back ciphertext
+ MOVDQ [OUT2], XDATA2 ; write back ciphertext
+ MOVDQ [OUT3], XDATA3 ; write back ciphertext
+
+ cmp LEN, IDX
+ je done
+
+main_loop:
+ pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV
+ pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV
+ pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV
+ pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV
+
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+
+
+ MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext
+ MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex
+ MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex
+ MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex
+
+
+ add IDX, 16
+ cmp LEN, IDX
+ jne main_loop
+
+done:
+ ;; update IV
+ movdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ movdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ movdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ movdqa [ARG + _aesarg_IV + 16*3], XDATA3
+
+ ;; update IN and OUT
+ add IN0, LEN
+ mov [ARG + _aesarg_in + 8*0], IN0
+ add IN1, LEN
+ mov [ARG + _aesarg_in + 8*1], IN1
+ add IN2, LEN
+ mov [ARG + _aesarg_in + 8*2], IN2
+ add IN3, LEN
+ mov [ARG + _aesarg_in + 8*3], IN3
+
+ add OUT0, LEN
+ mov [ARG + _aesarg_out + 8*0], OUT0
+ add OUT1, LEN
+ mov [ARG + _aesarg_out + 8*1], OUT1
+ add OUT2, LEN
+ mov [ARG + _aesarg_out + 8*2], OUT2
+ add OUT3, LEN
+ mov [ARG + _aesarg_out + 8*3], OUT3
+
+ pop rbp
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm
new file mode 100644
index 00000000..0b160c23
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm
@@ -0,0 +1,162 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "memcpy.asm"
+
+;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only.
+;;; It processes only one buffer at a time.
+;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI
+
+;; In System V AMD64 ABI
+;; calle saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX R9 R10
+;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0
+;;
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + 5*8]
+%endif
+
+%define OUT arg1
+%define IN arg2
+%define IV arg3
+%define KEYS arg4
+%ifdef LINUX
+%define LEN arg5
+%else
+%define LEN2 arg5
+%define LEN r11
+%endif
+
+%define TMP0 rax
+%define TMP1 r10
+%define PTR0 rsp + _buffer
+
+%define XDATA xmm0
+
+section .text
+
+struc STACK
+_buffer: resq 2
+_rsp_save: resq 1
+endstruc
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys)
+;; arg 1: OUT : addr to put clear/cipher text out
+;; arg 2: IN : addr to take cipher/clear text from
+;; arg 3: IV : initialization vector
+;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned)
+;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16)
+;;
+;; AES CFB128 one block encrypt/decrypt implementation.
+;; The function doesn't update IV. The result of operation can be found in OUT.
+;;
+;; It is primarly designed to process partial block of
+;; DOCSIS 3.1 AES Packet PDU Encryption (I.10)
+;;
+;; It process up to one block only (up to 16 bytes).
+;;
+;; It makes sure not to read more than LEN bytes from IN and
+;; not to store more than LEN bytes to OUT.
+
+MKGLOBAL(aes_cfb_128_one_sse,function,)
+align 32
+aes_cfb_128_one_sse:
+%ifndef LINUX
+ mov LEN, LEN2
+%endif
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+ mov [rsp + _rsp_save], rax
+
+ test LEN, 16
+ jz copy_in_lt16
+ movdqu XDATA, [IN]
+ movdqa [PTR0], XDATA
+ jmp copy_in_end
+copy_in_lt16:
+ memcpy_sse_16 PTR0, IN, LEN, TMP0, TMP1
+copy_in_end:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu XDATA, [IV] ; IV (or next to last block)
+ pxor XDATA, [KEYS + 16*0] ; 0. ARK
+ aesenc XDATA, [KEYS + 16*1] ; 1. ENC
+ aesenc XDATA, [KEYS + 16*2] ; 2. ENC
+ aesenc XDATA, [KEYS + 16*3] ; 3. ENC
+ aesenc XDATA, [KEYS + 16*4] ; 4. ENC
+ aesenc XDATA, [KEYS + 16*5] ; 5. ENC
+ aesenc XDATA, [KEYS + 16*6] ; 6. ENC
+ aesenc XDATA, [KEYS + 16*7] ; 7. ENC
+ aesenc XDATA, [KEYS + 16*8] ; 8. ENC
+ aesenc XDATA, [KEYS + 16*9] ; 9. ENC
+ aesenclast XDATA, [KEYS + 16*10] ; 10. ENC
+
+ pxor XDATA, [PTR0] ; plaintext/ciphertext XOR block cipher encryption
+
+ test LEN, 16
+ jz copy_out_lt16
+ movdqu [OUT], XDATA
+ jmp copy_out_end
+copy_out_lt16:
+ movdqa [PTR0], XDATA
+ memcpy_sse_16 OUT, PTR0, LEN, TMP0, TMP1
+copy_out_end:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
new file mode 100644
index 00000000..b806a817
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
@@ -0,0 +1,298 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do 128 bit AES XCBC
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In pointer at end
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_XCBC_ARGS_x8 {
+;; void* in[8];
+;; UINT128* keys[8];
+;; UINT128 ICV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+
+%define IN0 r8
+%define KEYS0 rbx
+%define OUT0 r9
+
+%define IN1 r10
+%define KEYS1 REG3
+%define OUT1 r11
+
+%define IN2 r12
+%define KEYS2 REG4
+%define OUT2 r13
+
+%define IN3 r14
+%define KEYS3 rbp
+%define OUT3 r15
+
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+
+%define XKEY0_3 xmm4
+%define XKEY0_6 [KEYS0 + 16*6]
+%define XTMP xmm5
+%define XKEY0_9 xmm6
+
+%define XKEY1_3 xmm7
+%define XKEY1_6 xmm8
+%define XKEY1_9 xmm9
+
+%define XKEY2_3 xmm10
+%define XKEY2_6 xmm11
+%define XKEY2_9 xmm12
+
+%define XKEY3_3 xmm13
+%define XKEY3_6 xmm14
+%define XKEY3_9 xmm15
+
+section .text
+
+MKGLOBAL(aes_xcbc_mac_128_x4,function,internal)
+aes_xcbc_mac_128_x4:
+
+ push rbp
+
+ mov IDX, 16
+
+ mov IN0, [ARG + _aesxcbcarg_in + 8*0]
+ mov IN1, [ARG + _aesxcbcarg_in + 8*1]
+ mov IN2, [ARG + _aesxcbcarg_in + 8*2]
+ mov IN3, [ARG + _aesxcbcarg_in + 8*3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ MOVDQ XDATA0, [IN0] ; load first block of plain text
+ MOVDQ XDATA1, [IN1] ; load first block of plain text
+ MOVDQ XDATA2, [IN2] ; load first block of plain text
+ MOVDQ XDATA3, [IN3] ; load first block of plain text
+
+ mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3]
+
+ pxor XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV
+ pxor XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV
+ pxor XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV
+ pxor XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+ movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key
+ movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key
+ movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key
+ movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key
+ movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key
+ movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key
+ movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key
+ movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ cmp LEN, IDX
+ je done
+
+main_loop:
+ pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR ICV
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ add IDX, 16
+ cmp LEN, IDX
+ jne main_loop
+
+done:
+ ;; update ICV
+ movdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0
+ movdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1
+ movdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2
+ movdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3
+
+ ;; update IN
+ add IN0, LEN
+ mov [ARG + _aesxcbcarg_in + 8*0], IN0
+ add IN1, LEN
+ mov [ARG + _aesxcbcarg_in + 8*1], IN1
+ add IN2, LEN
+ mov [ARG + _aesxcbcarg_in + 8*2], IN2
+ add IN3, LEN
+ mov [ARG + _aesxcbcarg_in + 8*3], IN3
+
+ pop rbp
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm
new file mode 100644
index 00000000..c622e726
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm
@@ -0,0 +1,30 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define GCM128_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm
new file mode 100644
index 00000000..c37c3be6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2017-2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm
new file mode 100644
index 00000000..646020b5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm
new file mode 100644
index 00000000..6b283608
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm
@@ -0,0 +1,2166 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Must be a multiple of 4 bytes and from the definition of the spec.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "os.asm"
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_sse.asm!"
+%endif
+%endif
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse
+%define NROUNDS 13
+%endif
+
+default rel
+; need to push 4 registers into stack to maintain
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b
+ pshufd %%T3, %%HK, 01001110b
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed right shifting << 31
+ pslld %%T3, 30 ; packed right shifting shift << 30
+ pslld %%T4, 25 ; packed right shifting shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+ psrld %%T2,1 ; packed left shifting >> 1
+ psrld %%T3,2 ; packed left shifting >> 2
+ psrld %%T4,7 ; packed left shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5
+ pxor %%GH, %%T2
+ pxor %%GH, %%T1 ; the result is in %%T1
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b
+ pxor %%T1, %%HK
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+ mov r13, [%%GDATA_CTX + PBlockLen]
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192)
+movdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA_KEY+16*0]
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS ; do early (13) rounds (11 for GCM192)
+ movdqu %%T_key, [%%GDATA_KEY+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+%ifdef GCM128_MODE
+ movdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*12] ; finish last key round
+%endif
+%ifdef GCM256_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*12]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*13]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*14] ; finish last key round
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+%ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+%endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+%ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+%endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed right shifting << 31
+ pslld %%T3, 30 ; packed right shifting shift << 30
+ pslld %%T1, 25 ; packed right shifting shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+ psrld %%T2,1 ; packed left shifting >> 1
+ psrld %%T3,2 ; packed left shifting >> 2
+ psrld %%T1,7 ; packed left shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+ pxor %%T7, %%T4 ; the result is in %%T4
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
+
+
+; GHASH the last 4 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed right shifting << 31
+ pslld %%T3, 30 ; packed right shifting shift << 30
+ pslld %%T4, 25 ; packed right shifting shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+ psrld %%T2,1 ; packed left shifting >> 1
+ psrld %%T3,2 ; packed left shifting >> 2
+ psrld %%T4,7 ; packed left shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1
+%assign i 1
+%rep NROUNDS
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenclast %%ST, %%T1
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ pinsrq xmm2, [r10], 0
+ pinsrd xmm2, [r10+8], 2
+ movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK]
+
+ movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
+; struct has been initialized by GCM_INIT.
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+ mov r12, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], r12 ;Update length of data processed
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+%endif
+ movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+ movdqu [%%GDATA_CTX + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13
+ ; handle the last <16 Byte block seperately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes)
+ movdqu xmm14, [%%GDATA_CTX + AadHash]
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1
+ psrlq xmm2, 63
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ push r14
+ push r15
+ mov r14, rsp
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ movdqu [rsp + 0*16], xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ mov rsp, r14
+ pop r15
+ pop r14
+%endif
+ pop r13
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_256_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm
new file mode 100644
index 00000000..fdc5bbb1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_sse
+%include "mb_mgr_aes_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm
new file mode 100644
index 00000000..9b8ed393
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_sse
+%include "mb_mgr_aes_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm
new file mode 100644
index 00000000..44e2ed12
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_sse
+%include "mb_mgr_aes_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm
new file mode 100644
index 00000000..615d13ee
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_sse
+%include "mb_mgr_aes_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm
new file mode 100644
index 00000000..db216e9f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm
@@ -0,0 +1,210 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X4
+%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_sse
+%endif
+
+; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X4
+
+section .data
+default rel
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp rbp
+%define idx rbp
+
+%define tmp3 r8
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp qword [state + _aes_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp qword [state + _aes_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+
+ ; copy good_lane to empty lanes
+ mov tmp1, [state + _aes_args_in + good_lane*8]
+ mov tmp2, [state + _aes_args_out + good_lane*8]
+ mov tmp3, [state + _aes_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ movdqa xmm2, [state + _aes_args_IV + good_lane]
+ movdqa xmm0, [state + _aes_lens]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _aes_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_args_in + I*8], tmp1
+ mov [state + _aes_args_out + I*8], tmp2
+ mov [state + _aes_args_keys + I*8], tmp3
+ movdqa [state + _aes_args_IV + I*16], xmm2
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X4
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+; Don't write back IV
+; mov iv, [job_rax + _iv]
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 8
+ or unused_lanes, idx
+; shl idx, 4 ; multiply by 16
+ mov [state + _aes_unused_lanes], unused_lanes
+; movdqa xmm0, [state + _aes_args_IV + idx]
+; movdqu [iv], xmm0
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm
new file mode 100644
index 00000000..ddbd03fd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm
@@ -0,0 +1,179 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X4
+%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_sse
+%endif
+
+; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X4
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+section .text
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ mov len, [job + _msg_len_to_cipher_in_bytes]
+ and len, -16 ; DOCSIS may pass size unaligned to block size
+ mov iv, [job + _iv]
+ mov [state + _aes_unused_lanes], unused_lanes
+
+ mov [state + _aes_job_in_lane + lane*8], job
+ mov [state + _aes_lens + 2*lane], WORD(len)
+
+ mov tmp, [job + _src]
+ add tmp, [job + _cipher_start_src_offset_in_bytes]
+ movdqu xmm0, [iv]
+ mov [state + _aes_args_in + lane*8], tmp
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_args_keys + lane*8], tmp
+ mov tmp, [job + _dst]
+ mov [state + _aes_args_out + lane*8], tmp
+ shl lane, 4 ; multiply by 16
+ movdqa [state + _aes_args_IV + lane], xmm0
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+ ; Find min length
+ movdqa xmm0, [state + _aes_lens]
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X4
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+; Don't write back IV
+; mov iv, [job_rax + _iv]
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 8
+ or unused_lanes, idx
+; shl idx, 4 ; multiply by 16
+ mov [state + _aes_unused_lanes], unused_lanes
+; movdqa xmm0, [state + _aes_args_IV + idx]
+; movdqu [iv], xmm0
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm
new file mode 100644
index 00000000..18562745
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm
@@ -0,0 +1,225 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4
+%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse
+%endif
+
+; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X4
+
+section .data
+default rel
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define icv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp r10
+%define idx rbp
+
+%define tmp3 r8
+%define lane_data r9
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal)
+FLUSH_JOB_AES_XCBC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ mov tmp1, [state + _aes_xcbc_args_in + idx*8]
+ mov tmp3, [state + _aes_xcbc_args_keys + idx*8]
+ shl idx, 4 ; multiply by 16
+ movdqa xmm2, [state + _aes_xcbc_args_ICV + idx]
+ movdqa xmm0, [state + _aes_xcbc_lens]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_xcbc_args_in + I*8], tmp1
+ mov [state + _aes_xcbc_args_keys + I*8], tmp3
+ movdqa [state + _aes_xcbc_args_ICV + I*16], xmm2
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ movdqa [state + _aes_xcbc_lens], xmm0
+
+ ; Find min length
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X4
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop
+
+ mov dword [lane_data + _xcbc_final_done], 1
+ mov word [state + _aes_xcbc_lens + 2*idx], 16
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp
+ jmp copy_lane_data
+
+end_loop:
+ mov job_rax, [lane_data + _xcbc_job_in_lane]
+ mov icv, [job_rax + _auth_tag_output]
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ movdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ movq [icv], xmm0
+ pextrd [icv + 8], xmm0, 2
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm
new file mode 100644
index 00000000..fbf3d5de
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm
@@ -0,0 +1,250 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4
+%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse
+%endif
+
+; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X4
+
+section .data
+default rel
+
+align 16
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define idx rbp
+%define last_len rbp
+
+%define lane r8
+
+%define icv r9
+%define p2 r9
+
+%define tmp r10
+%define len r11
+%define lane_data r12
+%define p r13
+%define tmp2 r14
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
+SUBMIT_JOB_AES_XCBC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov [lane_data + _xcbc_job_in_lane], job
+ mov dword [lane_data + _xcbc_final_done], 0
+ mov tmp, [job + _k1_expanded]
+ mov [state + _aes_xcbc_args_keys + lane*8], tmp
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+
+ mov last_len, len
+
+ cmp len, 16
+ jle small_buffer
+
+ mov [state + _aes_xcbc_args_in + lane*8], p
+ add p, len ; set point to end of data
+
+ and last_len, 15 ; Check lsbs of msg len
+ jnz slow_copy ; if not 16B mult, do slow copy
+
+fast_copy:
+ movdqu xmm0, [p - 16] ; load last block M[n]
+ mov tmp, [job + _k2] ; load K2 address
+ movdqu xmm1, [tmp] ; load K2
+ pxor xmm0, xmm1 ; M[n] XOR K2
+ movdqa [lane_data + _xcbc_final_block], xmm0
+ sub len, 16 ; take last block off length
+end_fast_copy:
+
+ mov [state + _aes_xcbc_lens + 2*lane], WORD(len)
+
+ pxor xmm0, xmm0
+ shl lane, 4 ; multiply by 16
+ movdqa [state + _aes_xcbc_args_ICV + lane], xmm0
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _aes_xcbc_lens]
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X4
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop
+
+ mov dword [lane_data + _xcbc_final_done], 1
+ mov word [state + _aes_xcbc_lens + 2*idx], 16
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp
+ jmp start_loop
+
+end_loop:
+ ; process completed job "idx"
+ mov job_rax, [lane_data + _xcbc_job_in_lane]
+ mov icv, [job_rax + _auth_tag_output]
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ movdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ movq [icv], xmm0
+ pextrd [icv + 8], xmm0, 2
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+small_buffer:
+ ; For buffers <= 16 Bytes
+ ; The input data is set to final block
+ lea tmp, [lane_data + _xcbc_final_block] ; final block
+ mov [state + _aes_xcbc_args_in + lane*8], tmp
+ add p, len ; set point to end of data
+ cmp len, 16
+ je fast_copy
+
+slow_copy:
+ and len, ~15 ; take final block off len
+ sub p, last_len ; adjust data pointer
+ lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
+ sub p2, last_len ; adjust data pointer backwards
+ memcpy_sse_16_1 p2, p, last_len, tmp, tmp2
+ movdqa xmm0, [rel x80] ; fill reg with padding
+ movdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
+ movdqu xmm0, [p2] ; load final block to process
+ mov tmp, [job + _k3] ; load K3 address
+ movdqu xmm1, [tmp] ; load K3
+ pxor xmm0, xmm1 ; M[n] XOR K3
+ movdqu [lane_data + _xcbc_final_block], xmm0 ; write final block
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm
new file mode 100644
index 00000000..0aaad7e5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm
@@ -0,0 +1,260 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; calle saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8
+;; Windows preserves: RBX RBP RSI RDI R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RSI RDI R8
+;; Linux preserves: RBX RCX RDX RBP R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha1_ni
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+one:
+ dq 1
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define p2 r8
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : state
+MKGLOBAL(flush_job_hmac_ni_sse,function,internal)
+flush_job_hmac_ni_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha1-ni-sse flush"
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job, assume it is 0 then check 1
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ DBGPRINTL64 "idx:", idx
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+ movzx len2, word [state + _lens + idx*2]
+
+ DBGPRINTL64 "ptr", tmp
+
+ ; there are only two lanes so if one is empty it is easy to determine which one
+ xor idx, 1
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ xor idx, 1
+
+ ; No need to find min length - only two lanes available
+ cmp len2, 0
+ je len_is_0
+
+ ; Set length on both lanes to 0
+ mov dword [state + _lens], 0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_ni
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ DBGPRINTL64 "outer-block-index", idx
+ lea tmp, [lane_data + _outer_block]
+ DBGPRINTL64 "outer block ptr:", tmp
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ ;; idx determines which column
+ ;; read off from consecutive rows
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+ lea p2, [idx + idx*4]
+ movdqu xmm0, [state + _args_digest + p2*4]
+ pshufb xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE]
+ bswap DWORD(tmp)
+ movdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0
+ DBGPRINTL64 "sha1 outer hash input word 4", tmp
+ mov job, [lane_data + _job_in_lane]
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE]
+ movdqu [state + _args_digest + p2*4], xmm0
+ mov [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ DBGPRINTL64 "extra blocks-start offset", start_offset
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ DBGPRINTL64 "extra blocks-len", extra_blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ DBGPRINTL64 "extra block ptr", tmp
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+ lea idx, [idx + idx*4]
+ mov DWORD(tmp2), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp3)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp3)
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm
new file mode 100644
index 00000000..66b894d8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm
@@ -0,0 +1,254 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha1_mult_sse
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000
+ dq 0x0000000000000000, 0x0000000000000000
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : state
+MKGLOBAL(flush_job_hmac_sse,function,internal)
+flush_job_hmac_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha1-sse flush"
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ movdqa xmm0, [state + _lens]
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ movdqa [state + _lens], xmm0
+
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_sse
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ ;; idx determines which column
+ ;; read off from consecutive rows
+ movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ pshufb xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ movdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp)
+ DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0
+ DBGPRINTL64 "sha1 outer hash input word 4", tmp
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp3)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp3)
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm
new file mode 100644
index 00000000..73878e46
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm
@@ -0,0 +1,274 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+extern md5_x4x2_sse
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define idx rbp
+
+; unused_lanes must be in rax-rdx
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(flush_job_hmac_md5_sse,function,internal)
+flush_job_hmac_md5_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel three]
+ cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel four]
+ cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel five]
+ cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel six]
+ cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel seven]
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ movdqa xmm0, [state + _lens_md5]
+ mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ movdqa [state + _lens_md5], xmm0
+
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshufb xmm1, [rel dupw] ; duplicate words across all lanes
+ psubw xmm0, xmm1
+ movdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_md5 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+ movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+; pshufb xmm0, [byteswap wrt rip]
+ movdqa [lane_data + _outer_block], xmm0
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_md5 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_md5]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+; bswap DWORD(tmp2)
+; bswap DWORD(tmp4)
+; bswap DWORD(tmp3)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp3)
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm
new file mode 100644
index 00000000..cfe60e78
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm
@@ -0,0 +1,310 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "memcpy.asm"
+%include "reg_sizes.asm"
+
+extern md5_x4x2_sse
+
+section .data
+default rel
+align 16
+;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_md5_sse,function,internal)
+submit_job_hmac_md5_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov [state + _unused_lanes_md5], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens_md5 + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ movdqu xmm0, [p - 64 + 0*16]
+ movdqu xmm1, [p - 64 + 1*16]
+ movdqu xmm2, [p - 64 + 2*16]
+ movdqu xmm3, [p - 64 + 3*16]
+ movdqa [lane_data + _extra_block + 0*16], xmm0
+ movdqa [lane_data + _extra_block + 1*16], xmm1
+ movdqa [lane_data + _extra_block + 2*16], xmm2
+ movdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+; bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ movdqu xmm0, [tmp]
+ movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens_md5 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xf
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens_md5]
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshufb xmm1, [rel dupw] ; duplicate words across all lanes
+ psubw xmm0, xmm1
+ movdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_md5 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+ movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+; pshufb xmm0, [rel byteswap]
+ movdqa [lane_data + _outer_block], xmm0
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_md5 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; p2 clobbers unused_lanes, undo before exiting
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_md5]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm
new file mode 100644
index 00000000..a31a009c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "mb_mgr_hmac_sha_256_flush_ni_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm
new file mode 100644
index 00000000..ae945944
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_224_sse
+%define SHA224
+
+%include "mb_mgr_hmac_sha_256_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm
new file mode 100644
index 00000000..e98ac895
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "mb_mgr_hmac_sha_256_submit_ni_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm
new file mode 100644
index 00000000..ef1a6780
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_224_sse
+%define SHA224
+
+%include "mb_mgr_hmac_sha_256_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm
new file mode 100644
index 00000000..413a5944
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm
@@ -0,0 +1,268 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; calle saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha256_ni
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%define bswap_xmm4 xmm4
+
+struc STACK
+_gpr_save: resq 4 ;rbx, rbp, rsi (win), rdi (win)
+_rsp_save: resq 1
+endstruc
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+one: dq 1
+
+section .text
+
+%ifdef SHA224
+;; JOB* flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state)
+;; arg1 : state
+MKGLOBAL(flush_job_hmac_sha_224_ni_sse,function,internal)
+flush_job_hmac_sha_224_ni_sse:
+%else
+;; JOB* flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state)
+;; arg1 : state
+MKGLOBAL(flush_job_hmac_sha_256_ni_sse,function,internal)
+flush_job_hmac_sha_256_ni_sse:
+%endif
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha256-ni-sse flush"
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job, assume it is 0 then check 1
+ xor idx, idx
+ cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ DBGPRINTL64 "idx:", idx
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]
+ xor len2, len2
+ mov WORD(len2), word [state + _lens_sha256 + idx*2]
+
+ ; there are only two lanes so if one is empty it is easy to determine which one
+ xor idx, 1
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+ xor idx, 1
+
+ ; No need to find min length - only two lanes available
+ cmp len2, 0
+ je len_is_0
+
+ ; set length on both lanes to 0
+ mov dword [state + _lens_sha256], 0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_ni
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ movdqa bswap_xmm4, [rel byteswap]
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+ lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32
+ movdqu xmm0, [state + _args_digest_sha256 + tmp4*4]
+ movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
+ pshufb xmm0, bswap_xmm4
+ pshufb xmm1, bswap_xmm4
+ movdqa [lane_data + _outer_block], xmm0
+ movdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ ;; overwrite top 4 bytes with 0x80
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+ DBGPRINTL "sha256 outer hash input words:"
+ DBGPRINT_XMM xmm0
+ DBGPRINT_XMM xmm1
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+ DBGPRINTL64 "auth_key_xor_opad", tmp
+ movdqu [state + _args_digest_sha256 + tmp4*4], xmm0
+ movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
+ DBGPRINTL "new digest args"
+ DBGPRINT_XMM xmm0
+ DBGPRINT_XMM xmm1
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 16 bytes for SHA256, 14 bytes for SHA224
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+ shl idx, 5
+ movdqu xmm0, [state + _args_digest_sha256 + idx]
+ pshufb xmm0, bswap_xmm4
+%ifdef SHA224
+ ;; SHA224
+ movq [p + 0*4], xmm0
+ pextrd [p + 2*4], xmm0, 2
+ pextrw [p + 3*4], xmm0, 6
+%else
+ ;; SHA256
+ movdqu [p], xmm0
+%endif
+
+ DBGPRINTL "auth_tag_output:"
+ DBGPRINT_XMM xmm0
+
+return:
+ DBGPRINTL "exit sha256-ni-sse flush"
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm
new file mode 100644
index 00000000..bcaa7f89
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm
@@ -0,0 +1,274 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+extern sha_256_mult_sse
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_sse
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 3
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ movdqa xmm0, [state + _lens_sha256]
+ mov tmp, [state + _args_data_ptr_sha256 + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + 8*I], tmp
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ movdqa [state + _lens_sha256], xmm0
+
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ pshufb xmm0, [rel byteswap]
+ movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ pshufb xmm1, [rel byteswap]
+ movdqa [lane_data + _outer_block], xmm0
+ movdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+ movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 14 bytes for SHA224 and 16 bytes for SHA256
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+ bswap DWORD(tmp5)
+
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp6)
+
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp5)
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm
new file mode 100644
index 00000000..60dc04b0
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm
@@ -0,0 +1,347 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; calle saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha256_ni
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%define bswap_xmm4 xmm4
+
+struc STACK
+_gpr_save: resq 4 ; rbx, rbp, rsi (win), rdi (win)
+_rsp_save: resq 1
+endstruc
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef SHA224
+; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal)
+submit_job_hmac_sha_224_ni_sse:
+
+%else
+
+; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal)
+submit_job_hmac_sha_256_ni_sse:
+%endif
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha256-ni-sse submit"
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ movzx lane, BYTE(unused_lanes)
+ DBGPRINTL64 "lane: ", lane
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ DBGPRINTL64 "length: ", len
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens_sha256 + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + 8*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ movdqu xmm0, [p - 64 + 0*16]
+ movdqu xmm1, [p - 64 + 1*16]
+ movdqu xmm2, [p - 64 + 2*16]
+ movdqu xmm3, [p - 64 + 3*16]
+ movdqa [lane_data + _extra_block + 0*16], xmm0
+ movdqa [lane_data + _extra_block + 1*16], xmm1
+ movdqa [lane_data + _extra_block + 2*16], xmm2
+ movdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+ lea tmp, [lane*8] ; x8 here plus x4 scale factor give x32
+ movdqu [state + _args_digest_sha256 + tmp*4], xmm0
+ movdqu [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1
+ DBGPRINTL "args digest:"
+ DBGPRINT_XMM xmm0
+ DBGPRINT_XMM xmm1
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length - only two lanes available
+ xor len2, len2
+ mov tmp, 0x10000
+ mov WORD(len2), word [state + _lens_sha256 + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0)
+ mov WORD(tmp), word [state + _lens_sha256 + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1)
+ cmp WORD(len2), WORD(tmp)
+ cmovg DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length
+
+ mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields
+ shr DWORD(idx), 16
+ and DWORD(len2), 0xffff
+ je len_is_0
+
+ sub word [state + _lens_sha256 + 0*2], WORD(len2)
+ sub word [state + _lens_sha256 + 1*2], WORD(len2)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_ni
+ ; state is intact
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ movdqa bswap_xmm4, [rel byteswap]
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+ lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32
+ movdqu xmm0, [state + _args_digest_sha256 + tmp4*4]
+ movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
+ pshufb xmm0, bswap_xmm4
+ pshufb xmm1, bswap_xmm4
+ movdqa [lane_data + _outer_block], xmm0
+ movdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ ;; overwrite top 4 bytes with 0x80
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+ movdqu [state + _args_digest_sha256 + tmp4*4], xmm0
+ movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; p2 clobbers unused_lanes, undo before exit
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 16 bytes for SHA256, 14 for SHA224
+%if SHA256NI_DIGEST_ROW_SIZE != 32
+%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
+%endif
+ shl idx, 5
+ movdqu xmm0, [state + _args_digest_sha256 + idx]
+ pshufb xmm0, bswap_xmm4
+%ifdef SHA224
+ ;; SHA224
+ movq [p + 0*4], xmm0
+ pextrd [p + 2*4], xmm0, 2
+ pextrw [p + 3*4], xmm0, 6
+%else
+ ;; SHA256
+ movdqu [p], xmm0
+%endif
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm
new file mode 100644
index 00000000..a63990c1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm
@@ -0,0 +1,341 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+
+extern sha_256_mult_sse
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_sse
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*3], rsi
+ mov [rsp + _gpr_save + 8*4], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens_sha256 + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + 8*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ movdqu xmm0, [p - 64 + 0*16]
+ movdqu xmm1, [p - 64 + 1*16]
+ movdqu xmm2, [p - 64 + 2*16]
+ movdqu xmm3, [p - 64 + 3*16]
+ movdqa [lane_data + _extra_block + 0*16], xmm0
+ movdqa [lane_data + _extra_block + 1*16], xmm1
+ movdqa [lane_data + _extra_block + 2*16], xmm2
+ movdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+ movd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ movd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ pextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ pextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ pextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens_sha256]
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ pshufb xmm0, [rel byteswap]
+ movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ pshufb xmm1, [rel byteswap]
+ movdqa [lane_data + _outer_block], xmm0
+ movdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ movdqu xmm1, [tmp + 4*4]
+ movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; p2 clobbers unused_lanes, undo before exit
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 14 bytes for SHA224 and 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp4)
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*3]
+ mov rdi, [rsp + _gpr_save + 8*4]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm
new file mode 100644
index 00000000..846b09aa
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_384_sse
+%define SHA_X_DIGEST_SIZE 384
+
+%include "mb_mgr_hmac_sha_512_flush_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm
new file mode 100644
index 00000000..2ae645ab
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_384_sse
+%define SHA_X_DIGEST_SIZE 384
+
+%include "mb_mgr_hmac_sha_512_submit_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm
new file mode 100644
index 00000000..b96e370b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm
@@ -0,0 +1,249 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+extern sha512_x2_sse
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+one: dq 1
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_sse
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel one]
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ movdqa xmm0, [state + _lens_sha512]
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ movdqa [state + _lens_sha512], xmm0
+
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0xA0
+ psubw xmm0, xmm1
+ movdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+ mov word [state + _lens_sha512 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8*16))
+ movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE]
+ pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1) *SHA512_DIGEST_ROW_SIZE], 1
+ pshufb xmm0, [rel byteswap]
+ movdqa [lane_data + _outer_block_sha512 + I*16], xmm0
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+ movdqu xmm0, [tmp + I * 16]
+ movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm
new file mode 100644
index 00000000..579121dd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm
@@ -0,0 +1,325 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+
+extern sha512_x2_sse
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_sse
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512+ lane_data]
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of sha512 blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+ mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes
+
+ mov last_len, len
+ and last_len, 127
+ lea extra_blocks, [last_len + 17 + 127]
+ shr extra_blocks, 7
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p
+
+ cmp len, 128
+ jb copy_lt128
+
+fast_copy:
+ add p, len
+%assign I 0
+%rep 2
+ movdqu xmm0, [p - 128 + I*4*16 + 0*16]
+ movdqu xmm1, [p - 128 + I*4*16 + 1*16]
+ movdqu xmm2, [p - 128 + I*4*16 + 2*16]
+ movdqu xmm3, [p - 128 + I*4*16 + 3*16]
+ movdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0
+ movdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1
+ movdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2
+ movdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3
+%assign I (I+1)
+%endrep
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
+ lea tmp, [8*128 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ %assign I 0
+ %rep 4
+ movdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]
+ movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0
+ pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+ test len, ~127
+ jnz ge128_bytes
+
+lt128_bytes:
+ mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8
+ mov dword [lane_data + _extra_blocks_sha512], 0
+
+ge128_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens_sha512]
+ phminposuw xmm1, xmm0
+ pextrw DWORD(len2), xmm1, 0 ; min value
+ pextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0XA0
+ psubw xmm0, xmm1
+ movdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+ mov word [state + _lens_sha512 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE]
+ pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ pshufb xmm0, [rel byteswap]
+ movdqa [lane_data + _outer_block_sha512 + I*16], xmm0
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+ movdqu xmm0, [tmp + I*16]
+ movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp start_loop
+
+ align 16
+copy_lt128:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extra block but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 128]
+ sub p2, len
+ memcpy_sse_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm
new file mode 100644
index 00000000..a066c00a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm
@@ -0,0 +1,335 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; calle saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha1_ni
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len rbp
+%define idx rbp
+%define p4 rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+%define p3 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_ni_sse,function,internal)
+submit_job_hmac_ni_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha1-ni-sse submit"
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ DBGPRINTL64 "lane: ", lane
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ DBGPRINTL64 "length: ", len
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ DBGPRINTL64 "src pointer + offset:", p
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ movdqu xmm0, [p - 64 + 0*16]
+ movdqu xmm1, [p - 64 + 1*16]
+ movdqu xmm2, [p - 64 + 2*16]
+ movdqu xmm3, [p - 64 + 3*16]
+ movdqa [lane_data + _extra_block + 0*16], xmm0
+ movdqa [lane_data + _extra_block + 1*16], xmm1
+ movdqa [lane_data + _extra_block + 2*16], xmm2
+ movdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE]
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+ lea p4, [lane + lane*4]
+ movdqu [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
+ mov [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length - only two lanes available
+ xor len2, len2
+ mov p3, 0x10000
+ mov WORD(len2), word [state + _lens + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0)
+ mov WORD(p3), word [state + _lens + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1)
+ cmp WORD(len2), WORD(p3)
+ cmovg DWORD(len2), DWORD(p3) ; move if lane 0 length is greater than lane 1 length
+
+ mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields
+ shr DWORD(idx), 16
+ and DWORD(len2), 0xffff
+ je len_is_0
+
+ sub word [state + _lens + 0*2], WORD(len2)
+ sub word [state + _lens + 1*2], WORD(len2)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_ni
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+ lea p3, [idx + idx*4]
+ movdqu xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE]
+ pshufb xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE]
+ bswap DWORD(tmp)
+ movdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE]
+ movdqu [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
+ mov [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+%if SHA1NI_DIGEST_ROW_SIZE != 20
+%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
+%endif
+ lea idx, [idx + 4*idx]
+ mov DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm
new file mode 100644
index 00000000..deab9085
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm
@@ -0,0 +1,312 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+%include "memcpy.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+extern sha1_mult_sse
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rdi, rsi, rbx, rbp
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_sse,function, internal)
+submit_job_hmac_sse:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "enter sha1-sse submit"
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ movdqu xmm0, [p - 64 + 0*16]
+ movdqu xmm1, [p - 64 + 1*16]
+ movdqu xmm2, [p - 64 + 2*16]
+ movdqu xmm3, [p - 64 + 3*16]
+ movdqa [lane_data + _extra_block + 0*16], xmm0
+ movdqa [lane_data + _extra_block + 1*16], xmm1
+ movdqa [lane_data + _extra_block + 2*16], xmm2
+ movdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens]
+ phminposuw xmm1, xmm0
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_sse
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ pshufb xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ movdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ movdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c
new file mode 100644
index 00000000..e78c8c6e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c
@@ -0,0 +1,628 @@
+/*******************************************************************************
+ Copyright (c) 2012-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __WIN32
+#include <intrin.h>
+#endif
+
+#include "intel-ipsec-mb.h"
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+
+JOB_AES_HMAC *submit_job_aes128_enc_sse(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_sse(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_sse(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_sse(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_sse(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_sse(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state);
+
+
+JOB_AES_HMAC *submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state);
+
+#define SAVE_XMMS save_xmms
+#define RESTORE_XMMS restore_xmms
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_sse
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_sse
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_sse
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_sse
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_sse
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_sse
+#define SUBMIT_JOB_HMAC submit_job_hmac_sse
+#define FLUSH_JOB_HMAC flush_job_hmac_sse
+#define SUBMIT_JOB_HMAC_NI submit_job_hmac_ni_sse
+#define FLUSH_JOB_HMAC_NI flush_job_hmac_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_sse
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_sse
+#define SUBMIT_JOB_HMAC_SHA_224_NI submit_job_hmac_sha_224_ni_sse
+#define FLUSH_JOB_HMAC_SHA_224_NI flush_job_hmac_sha_224_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_sse
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_sse
+#define SUBMIT_JOB_HMAC_SHA_256_NI submit_job_hmac_sha_256_ni_sse
+#define FLUSH_JOB_HMAC_SHA_256_NI flush_job_hmac_sha_256_ni_sse
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_sse
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_sse
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_sse
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_sse
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_sse
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_sse
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse
+
+#define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_sse
+#define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_sse
+#define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_sse
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_sse
+#define AES_CBC_DEC_192 aes_cbc_dec_192_sse
+#define AES_CBC_DEC_256 aes_cbc_dec_256_sse
+
+#define AES_CNTR_128 aes_cntr_128_sse
+#define AES_CNTR_192 aes_cntr_192_sse
+#define AES_CNTR_256 aes_cntr_256_sse
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_sse
+#define AES_GCM_ENC_128 aes_gcm_enc_128_sse
+#define AES_GCM_DEC_192 aes_gcm_dec_192_sse
+#define AES_GCM_ENC_192 aes_gcm_enc_192_sse
+#define AES_GCM_DEC_256 aes_gcm_dec_256_sse
+#define AES_GCM_ENC_256 aes_gcm_enc_256_sse
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_sse
+#define FLUSH_JOB flush_job_sse
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_sse
+
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse
+#define QUEUE_SIZE queue_size_sse
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_SSE
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_SSE
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_SSE
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_SSE
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_SSE
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_sse
+
+void aes128_cbc_mac_x4(AES_ARGS_x8 *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x4
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch
+#define AES_CCM_MAX_JOBS 4
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_arch
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_arch
+#define AES_CMAC_MAX_JOBS 4
+
+/* ====================================================================== */
+
+/*
+ * Used to decide if SHA1/SHA256 SIMD or SHA1NI OOO scheduler should be
+ * called.
+ */
+#define HASH_USE_SHAEXT 1
+
+/* ====================================================================== */
+
+struct cpuid_regs {
+ uint32_t eax;
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+};
+
+/*
+ * A C wrapper for CPUID opcode
+ *
+ * Parameters:
+ * [in] leaf - CPUID leaf number (EAX)
+ * [in] subleaf - CPUID sub-leaf number (ECX)
+ * [out] out - registers structure to store results of CPUID into
+ */
+static void
+__mbcpuid(const unsigned leaf, const unsigned subleaf,
+ struct cpuid_regs *out)
+{
+#ifdef _WIN32
+ /* Windows */
+ int regs[4];
+
+ __cpuidex(regs, leaf, subleaf);
+ out->eax = regs[0];
+ out->ebx = regs[1];
+ out->ecx = regs[2];
+ out->edx = regs[3];
+#else
+ /* Linux */
+#ifdef __x86_64__
+ asm volatile("mov %4, %%eax\n\t"
+ "mov %5, %%ecx\n\t"
+ "cpuid\n\t"
+ "mov %%eax, %0\n\t"
+ "mov %%ebx, %1\n\t"
+ "mov %%ecx, %2\n\t"
+ "mov %%edx, %3\n\t"
+ : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx),
+ "=g" (out->edx)
+ : "g" (leaf), "g" (subleaf)
+ : "%eax", "%ebx", "%ecx", "%edx");
+#else
+ asm volatile("push %%ebx\n\t"
+ "mov %4, %%eax\n\t"
+ "mov %5, %%ecx\n\t"
+ "cpuid\n\t"
+ "mov %%eax, %0\n\t"
+ "mov %%ebx, %1\n\t"
+ "mov %%ecx, %2\n\t"
+ "mov %%edx, %3\n\t"
+ "pop %%ebx\n\t"
+ : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx),
+ "=g" (out->edx)
+ : "g" (leaf), "g" (subleaf)
+ : "%eax", "%ecx", "%edx");
+#endif
+#endif /* Linux */
+}
+
+/*
+ * Uses CPUID instruction to detected presence of SHA extensions.
+ *
+ * Return value:
+ * 0 - SHA extensions not present
+ * 1 - SHA extensions present
+ */
+static int
+sha_extensions_supported(void)
+{
+ struct cpuid_regs r;
+
+ /* Check highest leaf number. If less then 7 then SHA not supported. */
+ __mbcpuid(0x0, 0x0, &r);
+ if (r.eax < 0x7)
+ return 0;
+
+ /* Check presence of SHA extensions in the extended feature flags */
+ __mbcpuid(0x7, 0x0, &r);
+ if (r.ebx & (1 << 29))
+ return 1;
+
+ return 0;
+}
+
+void
+init_mb_mgr_sse(MB_MGR *state)
+{
+ unsigned int j;
+ uint8_t *p;
+
+ state->features &= (~IMB_FEATURE_SHANI);
+ if (!(state->flags & IMB_FLAG_SHANI_OFF))
+ if (sha_extensions_supported())
+ state->features |= IMB_FEATURE_SHANI;
+
+ /* Init AES out-of-order fields */
+ state->aes128_ooo.lens[0] = 0;
+ state->aes128_ooo.lens[1] = 0;
+ state->aes128_ooo.lens[2] = 0;
+ state->aes128_ooo.lens[3] = 0;
+ state->aes128_ooo.lens[4] = 0xFFFF;
+ state->aes128_ooo.lens[5] = 0xFFFF;
+ state->aes128_ooo.lens[6] = 0xFFFF;
+ state->aes128_ooo.lens[7] = 0xFFFF;
+ state->aes128_ooo.unused_lanes = 0xFF03020100;
+ state->aes128_ooo.job_in_lane[0] = NULL;
+ state->aes128_ooo.job_in_lane[1] = NULL;
+ state->aes128_ooo.job_in_lane[2] = NULL;
+ state->aes128_ooo.job_in_lane[3] = NULL;
+
+ state->aes192_ooo.lens[0] = 0;
+ state->aes192_ooo.lens[1] = 0;
+ state->aes192_ooo.lens[2] = 0;
+ state->aes192_ooo.lens[3] = 0;
+ state->aes192_ooo.lens[4] = 0xFFFF;
+ state->aes192_ooo.lens[5] = 0xFFFF;
+ state->aes192_ooo.lens[6] = 0xFFFF;
+ state->aes192_ooo.lens[7] = 0xFFFF;
+ state->aes192_ooo.unused_lanes = 0xFF03020100;
+ state->aes192_ooo.job_in_lane[0] = NULL;
+ state->aes192_ooo.job_in_lane[1] = NULL;
+ state->aes192_ooo.job_in_lane[2] = NULL;
+ state->aes192_ooo.job_in_lane[3] = NULL;
+
+ state->aes256_ooo.lens[0] = 0;
+ state->aes256_ooo.lens[1] = 0;
+ state->aes256_ooo.lens[2] = 0;
+ state->aes256_ooo.lens[3] = 0;
+ state->aes256_ooo.lens[4] = 0xFFFF;
+ state->aes256_ooo.lens[5] = 0xFFFF;
+ state->aes256_ooo.lens[6] = 0xFFFF;
+ state->aes256_ooo.lens[7] = 0xFFFF;
+ state->aes256_ooo.unused_lanes = 0xFF03020100;
+ state->aes256_ooo.job_in_lane[0] = NULL;
+ state->aes256_ooo.job_in_lane[1] = NULL;
+ state->aes256_ooo.job_in_lane[2] = NULL;
+ state->aes256_ooo.job_in_lane[3] = NULL;
+
+ /* DOCSIS SEC BPI uses same settings as AES128 CBC */
+ state->docsis_sec_ooo.lens[0] = 0;
+ state->docsis_sec_ooo.lens[1] = 0;
+ state->docsis_sec_ooo.lens[2] = 0;
+ state->docsis_sec_ooo.lens[3] = 0;
+ state->docsis_sec_ooo.lens[4] = 0xFFFF;
+ state->docsis_sec_ooo.lens[5] = 0xFFFF;
+ state->docsis_sec_ooo.lens[6] = 0xFFFF;
+ state->docsis_sec_ooo.lens[7] = 0xFFFF;
+ state->docsis_sec_ooo.unused_lanes = 0xFF03020100;
+ state->docsis_sec_ooo.job_in_lane[0] = NULL;
+ state->docsis_sec_ooo.job_in_lane[1] = NULL;
+ state->docsis_sec_ooo.job_in_lane[2] = NULL;
+ state->docsis_sec_ooo.job_in_lane[3] = NULL;
+
+ /* Init HMAC/SHA1 out-of-order fields */
+ state->hmac_sha_1_ooo.lens[0] = 0;
+ state->hmac_sha_1_ooo.lens[1] = 0;
+ state->hmac_sha_1_ooo.lens[2] = 0;
+ state->hmac_sha_1_ooo.lens[3] = 0;
+ state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < SSE_NUM_SHA1_LANES; j++) {
+ state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+ memset(p + 5*4 + 1,
+ 0x00,
+ 64 - 5*4 - 1 - 2);
+ p[5*4] = 0x80;
+ p[64-2] = 0x02;
+ p[64-1] = 0xA0;
+ }
+
+#ifdef HASH_USE_SHAEXT
+ if (state->features & IMB_FEATURE_SHANI) {
+ /* Init HMAC/SHA1 NI out-of-order fields */
+ state->hmac_sha_1_ooo.lens[0] = 0;
+ state->hmac_sha_1_ooo.lens[1] = 0;
+ state->hmac_sha_1_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_1_ooo.unused_lanes = 0xFF0100;
+ }
+#endif /* HASH_USE_SHAEXT */
+
+ /* Init HMAC/SHA224 out-of-order fields */
+ state->hmac_sha_224_ooo.lens[0] = 0;
+ state->hmac_sha_224_ooo.lens[1] = 0;
+ state->hmac_sha_224_ooo.lens[2] = 0;
+ state->hmac_sha_224_ooo.lens[3] = 0;
+ state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < SSE_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+ memset(p + 8*4 + 1,
+ 0x00,
+ 64 - 8*4 - 1 - 2);
+ p[7*4] = 0x80; /* digest 7 words long */
+ p[64-2] = 0x02; /* length in little endian = 0x02E0 */
+ p[64-1] = 0xE0;
+ }
+#ifdef HASH_USE_SHAEXT
+ if (state->features & IMB_FEATURE_SHANI) {
+ /* Init HMAC/SHA224 NI out-of-order fields */
+ state->hmac_sha_224_ooo.lens[0] = 0;
+ state->hmac_sha_224_ooo.lens[1] = 0;
+ state->hmac_sha_224_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_224_ooo.unused_lanes = 0xFF0100;
+ }
+#endif /* HASH_USE_SHAEXT */
+
+ /* Init HMAC/SHA_256 out-of-order fields */
+ state->hmac_sha_256_ooo.lens[0] = 0;
+ state->hmac_sha_256_ooo.lens[1] = 0;
+ state->hmac_sha_256_ooo.lens[2] = 0;
+ state->hmac_sha_256_ooo.lens[3] = 0;
+ state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < SSE_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+ memset(p + 8*4 + 1,
+ 0x00,
+ 64 - 8*4 - 1 - 2); /* digest is 8*4 bytes long */
+ p[8*4] = 0x80;
+ p[64-2] = 0x03; /* length of (opad (64*8) bits + 256 bits)
+ * in hex is 0x300 */
+ p[64-1] = 0x00;
+ }
+#ifdef HASH_USE_SHAEXT
+ if (state->features & IMB_FEATURE_SHANI) {
+ /* Init HMAC/SHA256 NI out-of-order fields */
+ state->hmac_sha_256_ooo.lens[0] = 0;
+ state->hmac_sha_256_ooo.lens[1] = 0;
+ state->hmac_sha_256_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_256_ooo.unused_lanes = 0xFF0100;
+ }
+#endif /* HASH_USE_SHAEXT */
+
+ /* Init HMAC/SHA384 out-of-order fields */
+ state->hmac_sha_384_ooo.lens[0] = 0;
+ state->hmac_sha_384_ooo.lens[1] = 0;
+ state->hmac_sha_384_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_384_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < SSE_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1),
+ 0x00, SHA_384_BLOCK_SIZE + 7);
+
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_384_BLOCK_SIZE -
+ SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */
+ /*
+ * hmac outer block length always of fixed size, it is OKey
+ * length, a whole message block length, 1024 bits, with padding
+ * plus the length of the inner digest, which is 384 bits
+ * 1408 bits == 0x0580. The input message block needs to be
+ * converted to big endian within the sha implementation
+ * before use.
+ */
+ p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+ p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+ }
+
+ /* Init HMAC/SHA512 out-of-order fields */
+ state->hmac_sha_512_ooo.lens[0] = 0;
+ state->hmac_sha_512_ooo.lens[1] = 0;
+ state->hmac_sha_512_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_512_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < SSE_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1),
+ 0x00, SHA_512_BLOCK_SIZE + 7);
+
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_512_BLOCK_SIZE -
+ SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */
+ /*
+ * hmac outer block length always of fixed size, it is OKey
+ * length, a whole message block length, 1024 bits, with padding
+ * plus the length of the inner digest, which is 512 bits
+ * 1536 bits == 0x600. The input message block needs to be
+ * converted to big endian within the sha implementation
+ * before use.
+ */
+ p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+ p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+ }
+
+ /* Init HMAC/MD5 out-of-order fields */
+ state->hmac_md5_ooo.lens[0] = 0;
+ state->hmac_md5_ooo.lens[1] = 0;
+ state->hmac_md5_ooo.lens[2] = 0;
+ state->hmac_md5_ooo.lens[3] = 0;
+ state->hmac_md5_ooo.lens[4] = 0;
+ state->hmac_md5_ooo.lens[5] = 0;
+ state->hmac_md5_ooo.lens[6] = 0;
+ state->hmac_md5_ooo.lens[7] = 0;
+ state->hmac_md5_ooo.lens[8] = 0xFFFF;
+ state->hmac_md5_ooo.lens[9] = 0xFFFF;
+ state->hmac_md5_ooo.lens[10] = 0xFFFF;
+ state->hmac_md5_ooo.lens[11] = 0xFFFF;
+ state->hmac_md5_ooo.lens[12] = 0xFFFF;
+ state->hmac_md5_ooo.lens[13] = 0xFFFF;
+ state->hmac_md5_ooo.lens[14] = 0xFFFF;
+ state->hmac_md5_ooo.lens[15] = 0xFFFF;
+ state->hmac_md5_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < SSE_NUM_MD5_LANES; j++) {
+ state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_md5_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64 + 7);
+ p = state->hmac_md5_ooo.ldata[j].outer_block;
+ memset(p + (5 * 4) + 1,
+ 0x00,
+ 64 - (5 * 4) - 1 - 2);
+ p[4*4] = 0x80;
+ p[64-7] = 0x02;
+ p[64-8] = 0x80;
+ }
+
+ /* Init AES/XCBC OOO fields */
+ state->aes_xcbc_ooo.lens[0] = 0;
+ state->aes_xcbc_ooo.lens[1] = 0;
+ state->aes_xcbc_ooo.lens[2] = 0;
+ state->aes_xcbc_ooo.lens[3] = 0;
+ state->aes_xcbc_ooo.lens[4] = 0xFFFF;
+ state->aes_xcbc_ooo.lens[5] = 0xFFFF;
+ state->aes_xcbc_ooo.lens[6] = 0xFFFF;
+ state->aes_xcbc_ooo.lens[7] = 0xFFFF;
+ state->aes_xcbc_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < 4; j++) {
+ state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+ state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+ memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15);
+ }
+
+ /* Init AES-CCM auth out-of-order fields */
+ for (j = 0; j < 4; j++) {
+ state->aes_ccm_ooo.init_done[j] = 0;
+ state->aes_ccm_ooo.lens[j] = 0;
+ state->aes_ccm_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_ccm_ooo.unused_lanes = 0xF3210;
+
+ /* Init AES-CMAC auth out-of-order fields */
+ for (j = 0; j < 4; j++) {
+ state->aes_cmac_ooo.init_done[j] = 0;
+ state->aes_cmac_ooo.lens[j] = 0;
+ state->aes_cmac_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_cmac_ooo.unused_lanes = 0xF3210;
+
+ /* Init "in order" components */
+ state->next_job = 0;
+ state->earliest_job = -1;
+
+ /* set SSE handlers */
+ state->get_next_job = get_next_job_sse;
+ state->submit_job = submit_job_sse;
+ state->submit_job_nocheck = submit_job_nocheck_sse;
+ state->get_completed_job = get_completed_job_sse;
+ state->flush_job = flush_job_sse;
+ state->queue_size = queue_size_sse;
+ state->keyexp_128 = aes_keyexp_128_sse;
+ state->keyexp_192 = aes_keyexp_192_sse;
+ state->keyexp_256 = aes_keyexp_256_sse;
+ state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_sse;
+ state->xcbc_keyexp = aes_xcbc_expand_key_sse;
+ state->des_key_sched = des_key_schedule;
+ state->sha1_one_block = sha1_one_block_sse;
+ state->sha224_one_block = sha224_one_block_sse;
+ state->sha256_one_block = sha256_one_block_sse;
+ state->sha384_one_block = sha384_one_block_sse;
+ state->sha512_one_block = sha512_one_block_sse;
+ state->md5_one_block = md5_one_block_sse;
+}
+
+#include "mb_mgr_code.h"
diff --git a/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm
new file mode 100644
index 00000000..ef9edafc
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm
@@ -0,0 +1,777 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute octal MD5 using SSE
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers xmm0-15
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+section .data align=64
+default rel
+
+align 64
+MKGLOBAL(MD5_TABLE,data,internal)
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+ONES:
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+
+section .text
+
+%ifdef LINUX
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define mem1 rcx
+%define mem2 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define mem1 rdi
+%define mem2 rsi
+%endif
+
+;; rbp is not clobbered
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+%define TBL rax
+%define IDX rbx
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+; stack size must be an odd multiple of 8 bytes in size
+struc STACK
+_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: reso 8 ; stores AA-DD, AA2-DD2
+ resb 8 ; for alignment
+endstruc
+%define STACK_SIZE STACK_size
+
+%define AA rsp + _DIGEST + 16*0
+%define BB rsp + _DIGEST + 16*1
+%define CC rsp + _DIGEST + 16*2
+%define DD rsp + _DIGEST + 16*3
+%define AA2 rsp + _DIGEST + 16*4
+%define BB2 rsp + _DIGEST + 16*5
+%define CC2 rsp + _DIGEST + 16*6
+%define DD2 rsp + _DIGEST + 16*7
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movdqa %%t0, %%r0
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movdqa %%t1, %%r2
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movdqa %%r1, %%t0
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movdqa %%r3, %%r0
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pand %%F,%%X
+ pxor %%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pxor %%F,%%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,[rel ONES] ; pnot %%F
+ por %%F,%%X
+ pxor %%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%tmp, (32-%%imm)
+ pslld %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ paddd %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ paddd %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ paddd %%A, %%FUN
+ paddd %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
+
+; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+;
+align 32
+MKGLOBAL(md5_x4x2_sse,function,internal)
+md5_x4x2_sse:
+
+ sub rsp, STACK_SIZE
+
+ ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
+ ;; Initialize digests
+ movdqa A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE]
+ movdqa B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
+ movdqa C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
+ movdqa D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ ;; Initialize digests
+ movdqa A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE]
+ movdqa B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
+ movdqa C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
+ movdqa D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ lea TBL, [rel MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1+_data_ptr_md5 +0*PTR_SZ]
+ mov inp1,[arg1+_data_ptr_md5 +1*PTR_SZ]
+ mov inp2,[arg1+_data_ptr_md5 +2*PTR_SZ]
+ mov inp3,[arg1+_data_ptr_md5 +3*PTR_SZ]
+ mov inp4,[arg1+_data_ptr_md5 +4*PTR_SZ]
+ mov inp5,[arg1+_data_ptr_md5 +5*PTR_SZ]
+ mov inp6,[arg1+_data_ptr_md5 +6*PTR_SZ]
+ mov inp7,[arg1+_data_ptr_md5 +7*PTR_SZ]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16],T0
+ movdqa [mem1+(I*4+1)*16],T1
+ movdqa [mem1+(I*4+2)*16],T2
+ movdqa [mem1+(I*4+3)*16],T3
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16 + 16*16],T0
+ movdqa [mem1+(I*4+1)*16 + 16*16],T1
+ movdqa [mem1+(I*4+2)*16 + 16*16],T2
+ movdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ movdqa [AA], A
+ movdqa [BB], B
+ movdqa [CC], C
+ movdqa [DD], D
+ ; save old digests
+ movdqa [AA2], A2
+ movdqa [BB2], B2
+ movdqa [CC2], C2
+ movdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; write out digests
+ movdqu [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A
+ movdqu [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B
+ movdqu [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C
+ movdqu [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D
+ movdqu [arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
+ movdqu [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
+ movdqu [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
+ movdqu [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 +_data_ptr_md5 + 0*PTR_SZ], inp0
+ mov [arg1 +_data_ptr_md5 + 1*PTR_SZ], inp1
+ mov [arg1 +_data_ptr_md5 + 2*PTR_SZ], inp2
+ mov [arg1 +_data_ptr_md5 + 3*PTR_SZ], inp3
+ mov [arg1 +_data_ptr_md5 + 4*PTR_SZ], inp4
+ mov [arg1 +_data_ptr_md5 + 5*PTR_SZ], inp5
+ mov [arg1 +_data_ptr_md5 + 6*PTR_SZ], inp6
+ mov [arg1 +_data_ptr_md5 + 7*PTR_SZ], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm
new file mode 100644
index 00000000..ad36a999
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm
@@ -0,0 +1,425 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+;; rbx, rsi, rdi, rbp, r12-r15 left intact
+;; This version is not safe to call from C/C++
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ paddd %%regE,[rsp + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
+ pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ must be an odd multiple of 8
+%define FRAMESZ 16*16 + 8
+
+%define MOVPS movdqu
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+align 32
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks);
+; arg 1 : rcx : pointer to args
+; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
+MKGLOBAL(sha1_mult_sse,function,internal)
+sha1_mult_sse:
+
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ movdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
+ movdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
+ movdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
+ movdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
+ movdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
+ DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
+ mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
+ DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3
+ xor IDX, IDX
+lloop:
+ movdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ MOVPS T2,[inp0+IDX]
+ MOVPS T1,[inp1+IDX]
+ MOVPS T4,[inp2+IDX]
+ MOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3
+ pshufb T0, F
+ movdqa [rsp+(I*4+0)*16],T0
+ pshufb T1, F
+ movdqa [rsp+(I*4+1)*16],T1
+ pshufb T2, F
+ movdqa [rsp+(I*4+2)*16],T2
+ pshufb T3, F
+ movdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ movdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ movdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ movdqa K, [rel K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ movdqa K, [rel K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ movdqa K, [rel K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ paddd A,AA
+ paddd B,BB
+ paddd C,CC
+ paddd D,DD
+ paddd E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ movdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
+ movdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
+ movdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
+ movdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
+ movdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
+ DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
+ DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
new file mode 100644
index 00000000..e2b0fe85
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
@@ -0,0 +1,482 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RDX R10 R11
+;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RDI R10 R11
+;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+
+%include "os.asm"
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define args arg1
+%define NUM_BLKS arg2
+
+; reso = resdq => 16 bytes
+struc frame
+.ABCD_SAVE reso 1
+.E_SAVE reso 1
+.ABCD_SAVEb reso 1
+.E_SAVEb reso 1
+.align resq 1
+endstruc
+
+%define INP r10
+%define INPb r11
+
+%define ABCD xmm0
+%define E0 xmm1 ; Need two E's b/c they ping pong
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+%define E_MASK xmm15
+
+section .data
+default rel
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f
+ dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000
+ dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks)
+;; arg1 : pointer to args
+;; arg2 : size (in blocks) ;; assumed to be >= 1
+
+section .text
+MKGLOBAL(sha1_ni,function,internal)
+align 32
+sha1_ni:
+ sub rsp, frame_size
+
+ DBGPRINTL "enter sha1-ni-x2"
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+
+ ;; load input pointers
+ mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ]
+ DBGPRINTL64 "jobA: pointer", INP
+ mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ]
+
+ add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition
+
+ ;; load initial digest
+ movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE]
+ pxor E0, E0
+ pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3
+ pshufd ABCD, ABCD, 0x1B
+
+ DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD
+ DBGPRINTL_XMM "jobA: digest in word 4", E0
+
+ movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE]
+ pxor E0b, E0b
+ pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3
+ pshufd ABCDb, ABCDb, 0x1B
+
+ movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa E_MASK, [rel UPPER_WORD_MASK]
+
+ DBGPRINTL "jobA data:"
+loop0:
+ ;; Copy digests
+ movdqa [rsp + frame.ABCD_SAVE], ABCD
+ movdqa [rsp + frame.E_SAVE], E0
+ movdqa [rsp + frame.ABCD_SAVEb], ABCDb
+ movdqa [rsp + frame.E_SAVEb], E0b
+
+ ;; Only needed if not using sha1nexte for rounds 0-3
+ pand E0, E_MASK
+ pand E0b, E_MASK
+
+ ;; Needed if using sha1nexte for rounds 0-3
+ ;; Need to rotate E right by 30
+ ;movdqa E1, E0
+ ;psrld E0, 30
+ ;pslld E1, 2
+ ;pxor E0, E1
+
+ ;; Rounds 0-3
+ movdqu MSG0, [INP + 0*16]
+ pshufb MSG0, SHUF_MASK
+ DBGPRINT_XMM MSG0
+ ;sha1nexte E0, MSG0
+ paddd E0, MSG0 ; instead of sha1nexte
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ movdqu MSG0b, [INPb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ ;sha1nexte E0b, MSG0b
+ paddd E0b, MSG0b ; instead of sha1nexte
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+
+ ;; Rounds 4-7
+ movdqu MSG1, [INP + 1*16]
+ pshufb MSG1, SHUF_MASK
+ DBGPRINT_XMM MSG1
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+ movdqu MSG1b, [INPb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ;; Rounds 8-11
+ movdqu MSG2, [INP + 2*16]
+ pshufb MSG2, SHUF_MASK
+ DBGPRINT_XMM MSG2
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ movdqu MSG2b, [INPb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 12-15
+ movdqu MSG3, [INP + 3*16]
+ pshufb MSG3, SHUF_MASK
+ DBGPRINT_XMM MSG3
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ movdqu MSG3b, [INPb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+
+ ;; Rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
+
+ ;; Rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+ ;; Need to rotate E left by 30
+ movdqa E1, E0
+ pslld E0, 30
+ psrld E1, 2
+ pxor E0, E1
+ movdqa E1b, E0b
+ pslld E0b, 30
+ psrld E1b, 2
+ pxor E0b, E1b
+
+ paddd ABCD, [rsp + frame.ABCD_SAVE]
+ paddd E0, [rsp + frame.E_SAVE]
+ paddd ABCDb, [rsp + frame.ABCD_SAVEb]
+ paddd E0b, [rsp + frame.E_SAVEb]
+
+ add INP, 64
+ add INPb, 64
+ cmp INP, NUM_BLKS
+ jne loop0
+
+ ;; write out digests
+ pshufd ABCD, ABCD, 0x1B
+ movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD
+ pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3
+ DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD
+ DBGPRINTL_XMM "jobA: digest out word 4", E0
+
+ pshufd ABCDb, ABCDb, 0x1B
+ movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb
+ pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3
+
+ ;; update input pointers
+ mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP
+ mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb
+
+done_hash:
+ add rsp, frame_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
new file mode 100644
index 00000000..aaa5a58f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
@@ -0,0 +1,507 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses SSE instructions
+%include "os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP rdi ; 1st arg
+%define CTX rsi ; 2nd arg
+%define REG3 ecx
+%define REG4 edx
+%else
+%define INP rcx ; 1st arg
+%define CTX rdx ; 2nd arg
+%define REG3 edi
+%define REG4 esi
+%endif
+
+%define FRAMESZ 3*16 + 1*8
+%define _RSP FRAMESZ-1*8 + rsp
+
+%define a eax
+%define b ebx
+%define c REG3
+%define d REG4
+%define e r8d
+%define T1 r9d
+%define f r10d
+%define RND r11d
+%define g r12d
+%define h r13d
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK xmm2
+
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regC
+ xor %%regF,%%regD
+ and %%regF,%%regB
+ xor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regD
+ xor %%regF,%%regC
+ xor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regB
+ mov %%regT,%%regB
+ or %%regF,%%regC
+ and %%regT,%%regC
+ and %%regF,%%regD
+ or %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+;; input is T1
+%macro ROUND 1
+%define %%MAGIC %1
+ add e,T1
+ mov T1,a
+ rol T1,5
+ add e,T1
+ %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+%endmacro
+
+%macro do_4i 1
+ movdqa XFER, XK
+ paddd XFER, X0
+ pextrd T1, XFER, 0
+ ;ROUND %1
+ add e,T1
+ ;SCHEDULE_4
+ movdqa XTMP0, X1
+ palignr XTMP0, X0, 8 ; XTMP0 = W[-14]
+ mov T1,a
+ movdqa XTMP1, X2
+ rol T1,5
+ pxor XTMP1, X0 ; XTMP1 = W[-8] ^ W[-16]
+ add e,T1
+ pxor XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+
+ ;; Finish low half
+ movdqa X4, X3
+ rol b,30
+ psrldq X4, 4 ; X4 = W[-3] {xxBA}
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 1
+ ;ROUND %1
+ add e,T1
+ pxor X4, XTMP0 ;
+ mov T1,a
+ movdqa XTMP1, X4
+ rol T1,5
+ ;; rotate X4 left 1
+ psrld XTMP1, (32-1)
+ add e,T1
+ pslld X4, 1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ pxor X4, XTMP1 ; X4 = W[0] {xxBA}
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 2
+ ;ROUND %1
+ add e,T1
+ movdqa XTMP1, X4
+ mov T1,a
+
+ ;; Finish high half
+ palignr XTMP1, X3, 4 ; XTMP1 = w[-3] {DCxx}
+ rol T1,5
+ add e,T1
+ pxor XTMP0, XTMP1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; rotate XTMP0 left 1
+ movdqa XTMP1, XTMP0
+ psrld XTMP1, (32-1)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 3
+ ;ROUND %1
+ add e,T1
+ mov T1,a
+ pslld XTMP0, 1
+ rol T1,5
+ add e,T1
+ pxor XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx}
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; COMBINE HALVES
+ shufps X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
+ rol b,30
+ add h,e
+
+ rotate_Xs
+ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_one_block_sse(void *input_data, UINT32 digest[8]
+;; arg 1 : rcx : pointer to input data
+;; arg 2 : rdx : pointer to digest
+MKGLOBAL(sha1_one_block_sse,function,)
+align 32
+sha1_one_block_sse:
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+
+ ;; byte swap first 16 dwords
+ movdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ mov rax,rsp ; copy rsp
+ MOVDQ X0, [INP + 0*16]
+ sub rsp,FRAMESZ
+ MOVDQ X1, [INP + 1*16]
+ and rsp,-64 ; align stack frame
+
+ movdqa [rsp + 0 * 16], xmm6
+ movdqa [rsp + 1 * 16], xmm7
+ movdqa [rsp + 2 * 16], xmm8
+
+ MOVDQ X2, [INP + 2*16]
+ mov [_RSP],rax ; save copy of rsp
+ MOVDQ X3, [INP + 3*16]
+ ;; load initial digest
+ mov a,0x67452301
+ pshufb X0, XTMP0
+ mov b,0xefcdab89
+ pshufb X1, XTMP0
+ mov c,0x98badcfe
+ pshufb X2, XTMP0
+ mov d,0x10325476
+ pshufb X3, XTMP0
+ mov e,0xc3d2e1f0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K00_19]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop1_5
+align 16
+loop1:
+
+ do_4i MAGIC_F0
+
+loop1_5:
+ do_4i MAGIC_F0
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K20_39]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop2_5
+align 16
+loop2:
+
+ do_4i MAGIC_F1
+
+loop2_5:
+ do_4i MAGIC_F1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K40_59]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop3_5
+align 16
+loop3:
+
+ do_4i MAGIC_F2
+
+loop3_5:
+ do_4i MAGIC_F2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop3
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;; do rounds 60-79
+ movdqa XK, [rel K60_79]
+
+ do_4i MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X0
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X1
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X2
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X3
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ add a,0x67452301
+ mov [SZ*0 + CTX], a
+ add b,0xefcdab89
+ mov [SZ*1 + CTX], b
+ add c,0x98badcfe
+ mov [SZ*2 + CTX], c
+ add d,0x10325476
+ mov [SZ*3 + CTX], d
+ add e,0xc3d2e1f0
+ mov [SZ*4 + CTX], e
+
+ movdqa xmm8, [rsp + 2 * 16]
+ movdqa xmm7, [rsp + 1 * 16]
+ movdqa xmm6, [rsp + 0 * 16]
+
+ mov rsp,[_RSP]
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm
new file mode 100644
index 00000000..26c887a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm
@@ -0,0 +1,41 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define H0 0xc1059ed8
+%define H1 0x367cd507
+%define H2 0x3070dd17
+%define H3 0xf70e5939
+%define H4 0xffc00b31
+%define H5 0x68581511
+%define H6 0x64f98fa7
+%define H7 0xbefa4fa4
+%define FUNC sha224_one_block_sse
+
+%include "sha256_one_block_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm
new file mode 100644
index 00000000..d2eab4f1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm
@@ -0,0 +1,604 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RCX RDX RSI RDI R11
+;; Windows preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RCX RDX RSI RDI R11
+;; Linux preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+
+%include "os.asm"
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+; resdq = res0 => 16 bytes
+struc frame
+.ABEF_SAVE reso 1
+.CDGH_SAVE reso 1
+.ABEF_SAVEb reso 1
+.CDGH_SAVEb reso 1
+.align resq 1
+endstruc
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define args arg1
+%define NUM_BLKS arg2
+
+%define INP arg3
+%define INPb arg4
+
+
+%define SHA256CONSTANTS r11
+
+;; MSG MUST be xmm0 (implicit argument)
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define STATE0b xmm8
+%define STATE1b xmm9
+%define MSGTMP0b xmm10
+%define MSGTMP1b xmm11
+%define MSGTMP2b xmm12
+%define MSGTMP3b xmm13
+%define MSGTMP xmm14
+
+%define SHUF_MASK xmm15
+
+section .data
+default rel
+align 64
+K256:
+ dd 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ dd 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ dd 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ dd 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ dd 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ dd 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ dd 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ dd 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ dd 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ dd 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ dd 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ dd 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ dd 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_ni(SHA256_ARGS *args, UINT32 size_in_blocks)
+;; arg1 : pointer to args
+;; arg2 : size (in blocks) ;; assumed to be >= 1
+section .text
+MKGLOBAL(sha256_ni,function,internal)
+align 32
+sha256_ni:
+ sub rsp, frame_size
+
+ DBGPRINTL "enter sha256-ni-x2"
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+
+ DBGPRINTL64 "jobA/B byte size:", NUM_BLKS
+
+ ;; load input pointers
+ mov INP, [args + _data_ptr_sha256 + 0*PTR_SZ]
+ mov INPb, [args + _data_ptr_sha256 + 1*PTR_SZ]
+
+ add NUM_BLKS, INP ; pointer to end of data
+
+ ;; load initial digest
+ ;; Probably need to reorder these appropriately
+ ;; DCBA, HGFE -> ABEF, CDGH
+
+ movdqu STATE0, [args + 0*SHA256NI_DIGEST_ROW_SIZE]
+ movdqu STATE1, [args + 0*SHA256NI_DIGEST_ROW_SIZE + 16]
+ movdqu STATE0b, [args + 1*SHA256NI_DIGEST_ROW_SIZE]
+ movdqu STATE1b, [args + 1*SHA256NI_DIGEST_ROW_SIZE + 16]
+ DBGPRINTL "jobA digest in:"
+ DBGPRINT_XMM STATE0
+ DBGPRINT_XMM STATE1
+ DBGPRINTL "jobB digest in:"
+ DBGPRINT_XMM STATE0b
+ DBGPRINT_XMM STATE1b
+
+ pshufd STATE0, STATE0, 0xB1 ; CDAB
+ pshufd STATE1, STATE1, 0x1B ; EFGH
+ movdqa MSGTMP4, STATE0
+ pshufd STATE0b, STATE0b, 0xB1 ; CDAB
+ pshufd STATE1b, STATE1b, 0x1B ; EFGH
+ movdqa MSGTMP, STATE0b
+ palignr STATE0, STATE1, 8 ; ABEF
+ palignr STATE0b, STATE1b, 8 ; ABEF
+ pblendw STATE1, MSGTMP4, 0xF0 ; CDGH
+ pblendw STATE1b, MSGTMP, 0xF0 ; CDGH
+
+ lea SHA256CONSTANTS,[rel K256]
+ movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+%ifdef DO_DBGPRINT
+ ;; prin buffer A
+ push r10
+ push NUM_BLKS
+ DBGPRINTL "jobA data:"
+ xor r10, r10
+ sub NUM_BLKS, INP
+.loop_dbgA:
+ movdqu MSG, [INP + r10 + 0*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INP + r10 + 1*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INP + r10 + 2*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INP + r10 + 3*16]
+ DBGPRINT_XMM MSG
+ add r10, 64
+ cmp NUM_BLKS, r10
+ jne .loop_dbgA
+ pop NUM_BLKS
+ pop r10
+%endif
+
+%ifdef DO_DBGPRINT
+ ;; prin buffer B
+ push r10
+ push NUM_BLKS
+ DBGPRINTL "jobB data:"
+ xor r10, r10
+ sub NUM_BLKS, INP
+.loop_dbgB:
+ movdqu MSG, [INPb + r10 + 0*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INPb + r10 + 1*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INPb + r10 + 2*16]
+ DBGPRINT_XMM MSG
+ movdqu MSG, [INPb + r10 + 3*16]
+ DBGPRINT_XMM MSG
+ add r10, 64
+ cmp NUM_BLKS, r10
+ jne .loop_dbgB
+ pop NUM_BLKS
+ pop r10
+%endif
+
+.loop0:
+ ;; Save digests
+ movdqa [rsp + frame.ABEF_SAVE], STATE0
+ movdqa [rsp + frame.CDGH_SAVE], STATE1
+ movdqa [rsp + frame.ABEF_SAVEb], STATE0b
+ movdqa [rsp + frame.CDGH_SAVEb], STATE1b
+
+ ;; Rounds 0-3
+ movdqu MSG, [INP + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [SHA256CONSTANTS + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqu MSG, [INPb + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0b, MSG
+ paddd MSG, [SHA256CONSTANTS + 0*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+
+ ;; Rounds 4-7
+ movdqu MSG, [INP + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [SHA256CONSTANTS + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqu MSG, [INPb + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1b, MSG
+ paddd MSG, [SHA256CONSTANTS + 1*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP0, MSGTMP1
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ;; Rounds 8-11
+ movdqu MSG, [INP + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [SHA256CONSTANTS + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqu MSG, [INPb + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2b, MSG
+ paddd MSG, [SHA256CONSTANTS + 2*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP1, MSGTMP2
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ;; Rounds 12-15
+ movdqu MSG, [INP + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [SHA256CONSTANTS + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3
+ palignr MSGTMP, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqu MSG, [INPb + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3b, MSG
+ paddd MSG, [SHA256CONSTANTS + 3*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3b
+ palignr MSGTMP, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP2, MSGTMP3
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ;; Rounds 16-19
+ movdqa MSG, MSGTMP0
+ paddd MSG, [SHA256CONSTANTS + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0
+ palignr MSGTMP, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [SHA256CONSTANTS + 4*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0b
+ palignr MSGTMP, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP3, MSGTMP0
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ;; Rounds 20-23
+ movdqa MSG, MSGTMP1
+ paddd MSG, [SHA256CONSTANTS + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1
+ palignr MSGTMP, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [SHA256CONSTANTS + 5*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1b
+ palignr MSGTMP, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP0, MSGTMP1
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ;; Rounds 24-27
+ movdqa MSG, MSGTMP2
+ paddd MSG, [SHA256CONSTANTS + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2
+ palignr MSGTMP, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [SHA256CONSTANTS + 6*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2b
+ palignr MSGTMP, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP1, MSGTMP2
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ;; Rounds 28-31
+ movdqa MSG, MSGTMP3
+ paddd MSG, [SHA256CONSTANTS + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3
+ palignr MSGTMP, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [SHA256CONSTANTS + 7*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3b
+ palignr MSGTMP, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP2, MSGTMP3
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ;; Rounds 32-35
+ movdqa MSG, MSGTMP0
+ paddd MSG, [SHA256CONSTANTS + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0
+ palignr MSGTMP, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [SHA256CONSTANTS + 8*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0b
+ palignr MSGTMP, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP3, MSGTMP0
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ;; Rounds 36-39
+ movdqa MSG, MSGTMP1
+ paddd MSG, [SHA256CONSTANTS + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1
+ palignr MSGTMP, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [SHA256CONSTANTS + 9*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1b
+ palignr MSGTMP, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP0, MSGTMP1
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ;; Rounds 40-43
+ movdqa MSG, MSGTMP2
+ paddd MSG, [SHA256CONSTANTS + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2
+ palignr MSGTMP, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [SHA256CONSTANTS + 10*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2b
+ palignr MSGTMP, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP1, MSGTMP2
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ;; Rounds 44-47
+ movdqa MSG, MSGTMP3
+ paddd MSG, [SHA256CONSTANTS + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3
+ palignr MSGTMP, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [SHA256CONSTANTS + 11*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP3b
+ palignr MSGTMP, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP2, MSGTMP3
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ;; Rounds 48-51
+ movdqa MSG, MSGTMP0
+ paddd MSG, [SHA256CONSTANTS + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0
+ palignr MSGTMP, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [SHA256CONSTANTS + 12*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP0b
+ palignr MSGTMP, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+ sha256msg1 MSGTMP3, MSGTMP0
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ;; Rounds 52-55
+ movdqa MSG, MSGTMP1
+ paddd MSG, [SHA256CONSTANTS + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1
+ palignr MSGTMP, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [SHA256CONSTANTS + 13*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP1b
+ palignr MSGTMP, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+
+ ;; Rounds 56-59
+ movdqa MSG, MSGTMP2
+ paddd MSG, [SHA256CONSTANTS + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2
+ palignr MSGTMP, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [SHA256CONSTANTS + 14*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ movdqa MSGTMP, MSGTMP2b
+ palignr MSGTMP, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+
+ ;; Rounds 60-63
+ movdqa MSG, MSGTMP3
+ paddd MSG, [SHA256CONSTANTS + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [SHA256CONSTANTS + 15*16]
+ sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument
+
+ paddd STATE0, [rsp + frame.ABEF_SAVE]
+ paddd STATE1, [rsp + frame.CDGH_SAVE]
+ paddd STATE0b, [rsp + frame.ABEF_SAVEb]
+ paddd STATE1b, [rsp + frame.CDGH_SAVEb]
+
+ add INP, 64
+ add INPb, 64
+ cmp INP, NUM_BLKS
+ jne .loop0
+
+ ;; update data pointers
+ mov [args + _data_ptr_sha256 + 0*PTR_SZ], INP
+ mov [args + _data_ptr_sha256 + 1*PTR_SZ], INPb
+
+ ; Reorder for writeback
+ pshufd STATE0, STATE0, 0x1B ; FEBA
+ pshufd STATE1, STATE1, 0xB1 ; DCHG
+ movdqa MSGTMP4, STATE0
+ pshufd STATE0b, STATE0b, 0x1B ; FEBA
+ pshufd STATE1b, STATE1b, 0xB1 ; DCHG
+ movdqa MSGTMP, STATE0b
+ pblendw STATE0, STATE1, 0xF0 ; DCBA
+ pblendw STATE0b, STATE1b, 0xF0 ; DCBA
+ palignr STATE1, MSGTMP4, 8 ; HGFE
+ palignr STATE1b, MSGTMP, 8 ; HGFE
+
+ ;; update digests
+ movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0
+ movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1
+ movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0b
+ movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1b
+
+ DBGPRINTL "jobA digest out:"
+ DBGPRINT_XMM STATE0
+ DBGPRINT_XMM STATE1
+ DBGPRINTL "jobB digest out:"
+ DBGPRINT_XMM STATE0b
+ DBGPRINT_XMM STATE1b
+
+done_hash:
+ DBGPRINTL "exit sha256-ni-x2"
+
+ add rsp, frame_size
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm
new file mode 100644
index 00000000..9369d188
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm
@@ -0,0 +1,517 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "os.asm"
+
+section .data
+default rel
+align 64
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+ dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+; shuffle xDxC -> DC00
+_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+ dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
+
+section .text
+
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2
+ pshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER xmm9
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c ecx
+%define d r8d
+%define e edx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c edi
+%define d esi
+%define e r8d
+
+%endif
+%define TBL rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 7
+%endif
+_XFER: reso 1
+endstruc
+
+%ifndef H0
+%define H0 0x6a09e667
+%define H1 0xbb67ae85
+%define H2 0x3c6ef372
+%define H3 0xa54ff53a
+%define H4 0x510e527f
+%define H5 0x9b05688c
+%define H6 0x1f83d9ab
+%define H7 0x5be0cd19
+%define FUNC sha256_one_block_sse
+%endif
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pslld XTMP1, (32-7)
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ psrld XTMP2, 7
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ pslld XTMP3, (32-18)
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ psrld XTMP2, 18
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP1, XTMP3
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pxor XTMP1, XTMP4 ; XTMP1 = s0
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
+ pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ pxor XTMP2, XTMP3
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
+ pxor X0, XTMP2 ; X0 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT32 digest[8])
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+section .text
+MKGLOBAL(FUNC,function,)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ movdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ movdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ movdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ movdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ movdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ movdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ movdqa [rsp + _XMM_SAVE + 6*16],xmm12
+%endif
+
+ ;; load initial digest
+ mov a,H0
+ mov b,H1
+ mov c,H2
+ mov d,H3
+ mov e,H4
+ mov f,H5
+ mov g,H6
+ mov h,H7
+
+ movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa SHUF_00BA, [rel _SHUF_00BA]
+ movdqa SHUF_DC00, [rel _SHUF_DC00]
+
+ lea TBL,[rel K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+align 16
+loop1:
+ movdqa XFER, [TBL + 0*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+loop2:
+ paddd X0, [TBL + 0*16]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ add a,H0
+ add b,H1
+ add c,H2
+ add d,H3
+ add e,H4
+ add f,H5
+ add g,H6
+ mov [4*0 + CTX],a
+ mov [4*1 + CTX],b
+ mov [4*2 + CTX],c
+ mov [4*3 + CTX],d
+ mov [4*4 + CTX],e
+ mov [4*5 + CTX],f
+ mov [4*6 + CTX],g
+ add h,H7
+ mov [4*7 + CTX],h
+
+done_hash:
+%ifndef LINUX
+ movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+%endif
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm
new file mode 100644
index 00000000..51a031fd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm
@@ -0,0 +1,41 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define H0 0xcbbb9d5dc1059ed8
+%define H1 0x629a292a367cd507
+%define H2 0x9159015a3070dd17
+%define H3 0x152fecd8f70e5939
+%define H4 0x67332667ffc00b31
+%define H5 0x8eb44a8768581511
+%define H6 0xdb0c2e0d64f98fa7
+%define H7 0x47b5481dbefa4fa4
+%define FUNC sha384_one_block_sse
+
+%include "sha512_one_block_sse.asm"
diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm
new file mode 100644
index 00000000..a42e942f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm
@@ -0,0 +1,495 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "os.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+
+%ifndef FUNC
+%define FUNC sha512_one_block_sse
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2
+ pshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+%define X4 xmm8
+%define X5 xmm9
+%define X6 xmm10
+%define X7 xmm11
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XFER xmm13
+
+%define BYTE_FLIP_MASK xmm12
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c rcx
+%define d r8
+%define e rdx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c rdi
+%define d rsi
+%define e r8
+
+%endif
+%define TBL rbp
+%define a rax
+%define b rbx
+
+%define f r9
+%define g r10
+%define h r11
+
+%define y0 r13
+%define y1 r14
+%define y2 r15
+
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 8
+%endif
+_XFER: reso 1
+endstruc
+
+%ifndef H0
+%define H0 0x6a09e667f3bcc908
+%define H1 0xbb67ae8584caa73b
+%define H2 0x3c6ef372fe94f82b
+%define H3 0xa54ff53a5f1d36f1
+%define H4 0x510e527fade682d1
+%define H5 0x9b05688c2b3e6c1f
+%define H6 0x1f83d9abfb41bd6b
+%define H7 0x5be0cd19137e2179
+%endif
+
+; rotate_Xs
+; Rotate values of symbols X0...X7
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X5
+%xdefine X5 X6
+%xdefine X6 X7
+%xdefine X7 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro TWO_ROUNDS_AND_SCHED 0
+
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X5
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (41-18) ; y0 = e >> (41-18)
+ palignr XTMP0, X4, 8 ; XTMP0 = W[-7]
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ ror y1, (39-34) ; y1 = a >> (39-34)
+ xor y1, a ; y1 = a ^ (a >> (39-34)
+ movdqa XTMP1, X1
+ ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ xor y2, g ; y2 = f^g
+ paddq XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ and y2, e ; y2 = (f^g)&e
+ ;; compute s0
+ palignr XTMP1, X0, 8 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH
+ ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ psllq XTMP1, (64-1)
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ psrlq XTMP2, 1
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = t1 + S0
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ psrlq XTMP2, 8
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ movdqa X0, XTMP3 ; X0 = W[-15]
+ psllq XTMP3, (64-8)
+
+
+ROTATE_ARGS
+ pxor XTMP1, XTMP3
+ psrlq X0, 7 ; X0 = W[-15] >> 7
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8
+ ror y0, (41-18) ; y0 = e >> (41-18)
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ pxor XTMP1, X0 ; XTMP1 = s0
+ ror y1, (39-34) ; y1 = a >> (39-34)
+ xor y1, a ; y1 = a ^ (a >> (39-34)
+ ;; compute s1
+ movdqa XTMP2, X7 ; XTMP2 = W[-2]
+ ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ xor y2, g ; y2 = f^g
+ paddq XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2]
+ movdqa X0, XTMP2 ; X0 = W[-2]
+ and y2, e ; y2 = (f^g)&e
+ ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41)
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ psllq XTMP3, (64-19)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH
+ psrlq X0, 19
+ ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ por XTMP3, X0 ; XTMP3 = W[-2] ror 19
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ movdqa X0, XTMP2 ; X0 = W[-2]
+ movdqa XTMP1, XTMP2 ; XTMP1 = W[-2]
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ psllq X0, (64-61)
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = t1 + S0
+ psrlq XTMP1, 61
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ por X0, XTMP1 ; X0 = W[-2] ror 61
+ psrlq XTMP2, 6 ; XTMP2 = W[-2] >> 6
+ pxor XTMP2, XTMP3
+ pxor X0, XTMP2 ; X0 = s1
+ paddq X0, XTMP0 ; X0 = {W[1], W[0]}
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 8]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (41-18) ; y0 = e >> (41-18)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ ror y1, (39-34) ; y1 = a >> (39-34)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (39-34)
+ ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6))
+ ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+section .data
+default rel
+align 64
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+h0: dq H0
+h1: dq H1
+h2: dq H2
+h3: dq H3
+h4: dq H4
+h5: dq H5
+h6: dq H6
+h7: dq H7
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT64 digest[8])
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+section .text
+MKGLOBAL(FUNC,function,)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ movdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ movdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ movdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ movdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ movdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ movdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ movdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ movdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ ;; load initial digest
+ mov a,[rel h0]
+ mov b,[rel h1]
+ mov c,[rel h2]
+ mov d,[rel h3]
+ mov e,[rel h4]
+ mov f,[rel h5]
+ mov g,[rel h6]
+ mov h,[rel h7]
+
+ movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ lea TBL,[rel K512]
+
+ ;; byte swap first 16 qwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK
+
+ ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds
+ mov SRND, 4
+align 16
+loop1:
+
+%assign i 0
+%rep 7
+ movdqa XFER, X0
+ paddq XFER, [TBL + i*16]
+ movdqa [rsp + _XFER], XFER
+ TWO_ROUNDS_AND_SCHED
+%assign i (i+1)
+%endrep
+
+ movdqa XFER, X0
+ paddq XFER, [TBL + 7*16]
+ movdqa [rsp + _XFER], XFER
+ add TBL, 8*16
+ TWO_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+ jmp loop2a
+loop2:
+ movdqa X0, X4
+ movdqa X1, X5
+ movdqa X2, X6
+ movdqa X3, X7
+
+loop2a:
+ paddq X0, [TBL + 0*16]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+
+ paddq X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ DO_ROUND 0
+ DO_ROUND 1
+
+ paddq X2, [TBL + 2*16]
+ movdqa [rsp + _XFER], X2
+ DO_ROUND 0
+ DO_ROUND 1
+
+ paddq X3, [TBL + 3*16]
+ movdqa [rsp + _XFER], X3
+ add TBL, 4*16
+ DO_ROUND 0
+ DO_ROUND 1
+
+ sub SRND, 1
+ jne loop2
+
+ add a,[rel h0]
+ add b,[rel h1]
+ add c,[rel h2]
+ add d,[rel h3]
+ add e,[rel h4]
+ add f,[rel h5]
+ add g,[rel h6]
+ mov [8*0 + CTX],a
+ mov [8*1 + CTX],b
+ mov [8*2 + CTX],c
+ mov [8*3 + CTX],d
+ mov [8*4 + CTX],e
+ mov [8*5 + CTX],f
+ mov [8*6 + CTX],g
+ add h,[rel h7]
+ mov [8*7 + CTX],h
+
+done_hash:
+%ifndef LINUX
+ movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ movdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%endif
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm
new file mode 100644
index 00000000..e4533053
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm
@@ -0,0 +1,439 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+section .data
+default rel
+align 64
+MKGLOBAL(K512_2,data,internal)
+K512_2:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPD movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ movapd %%t0, %%r0 ; t0 = a1 a0
+ shufpd %%r0, %%r1, 00b ; r0 = b0 a0
+ shufpd %%t0, %%r1, 11b ; t0 = b1 a1
+%endm
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psllq %%tmp, (64-(%%imm))
+ psrlq %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORQ a0, (18-14) ; sig1: a0 = (e >> 4)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORQ a1, 41 ; sig1: a1 = (e >> 41)
+ movdqa [SZ2*(%%i&0xf) + rsp],%%T1
+ paddq %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ paddq h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORQ a2, (34-28) ; sig0: a2 = (a >> 6)
+ paddq h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORQ a1, 39 ; sig0: a1 = (a >> 39)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddq h, a0
+
+ paddq d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddq h, a1 ; h = h + ch + W + K + maj
+ paddq h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ2*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ movdqa a2, a1
+ PRORQ a1, 61-19
+ pxor %%T1, a0
+ PRORQ %%T1, 1
+ pxor a1, a2
+ PRORQ a1, 19
+ psrlq a0, 7
+ pxor %%T1, a0
+ psrlq a2, 6
+ pxor a1, a2
+ paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp]
+ paddq a1, [SZ2*((%%i-7)&0xf) + rsp]
+ paddq %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+
+
+;; SHA512_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[2];
+;;
+
+;; void sha512_x2_sse(SHA512_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+MKGLOBAL(sha512_x2_sse,function,internal)
+align 32
+sha512_x2_sse:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ DBGPRINTL_XMM "incoming transposed sha512 digest", a, b, c, d, e, f, g, h
+ lea TBL,[rel K512_2]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+ DBGPRINTL64 "lloop enter INP_SIZE ", INP_SIZE
+ DBGPRINTL64 " IDX = ", IDX
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ2], a
+ movdqa [rsp + _DIGEST + 1*SZ2], b
+ movdqa [rsp + _DIGEST + 2*SZ2], c
+ movdqa [rsp + _DIGEST + 3*SZ2], d
+ movdqa [rsp + _DIGEST + 4*SZ2], e
+ movdqa [rsp + _DIGEST + 5*SZ2], f
+ movdqa [rsp + _DIGEST + 6*SZ2], g
+ movdqa [rsp + _DIGEST + 7*SZ2], h
+
+ DBGPRINTL "incoming data["
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ MOVPD TT2,[inp1+IDX+i*16]
+ DBGPRINTL_XMM "input message block", TT0
+ TRANSPOSE TT0, TT2, TT1
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+ DBGPRINTL "]"
+ add IDX, 8 * 16 ;; increment by a message block
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddq a, [rsp + _DIGEST + 0*SZ2]
+ paddq b, [rsp + _DIGEST + 1*SZ2]
+ paddq c, [rsp + _DIGEST + 2*SZ2]
+ paddq d, [rsp + _DIGEST + 3*SZ2]
+ paddq e, [rsp + _DIGEST + 4*SZ2]
+ paddq f, [rsp + _DIGEST + 5*SZ2]
+ paddq g, [rsp + _DIGEST + 6*SZ2]
+ paddq h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+ movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+ movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+ movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+ movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+ movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+ movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+ movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+ DBGPRINTL_XMM "exit transposed digest ", a, b, c, d, e, f, g, h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+DBGPRINTL "====================== exit sha512_x2_sse code =====================\n"
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm
new file mode 100644
index 00000000..3af8511d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm
@@ -0,0 +1,447 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute quad SHA256 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r12 r14 r15
+;;
+;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "os.asm"
+%include "mb_mgr_datastruct.asm"
+
+;%define DO_DBGPRINT
+%include "dbgprint.asm"
+
+section .data
+default rel
+align 64
+MKGLOBAL(K256_4,data,internal)
+K256_4:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ4
+
+; Define stack usage
+struc STACK
+_DATA: resb SZ4 * 16
+_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPS movups
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ paddd a1, [SZ4*((%%i-7)&0xf) + rsp]
+ paddd %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+
+
+;; SHA256_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[4];
+;;
+
+;; void sha_256_mult_sse(SHA256_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+MKGLOBAL(sha_256_mult_sse,function,internal)
+align 32
+sha_256_mult_sse:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa b,[STATE + 1 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa c,[STATE + 2 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa d,[STATE + 3 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa e,[STATE + 4 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa f,[STATE + 5 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa g,[STATE + 6 * SHA256_DIGEST_ROW_SIZE ]
+ movdqa h,[STATE + 7 * SHA256_DIGEST_ROW_SIZE ]
+
+ DBGPRINTL_XMM "incoming transposed sha256 digest", a, b, c, d, e, f, g, h
+ lea TBL,[rel K256_4]
+
+ ;; load the address of each of the 4 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]
+ DBGPRINTL64 "incoming input data ptrs ", inp0, inp1, inp2, inp3
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ4], a
+ movdqa [rsp + _DIGEST + 1*SZ4], b
+ movdqa [rsp + _DIGEST + 2*SZ4], c
+ movdqa [rsp + _DIGEST + 3*SZ4], d
+ movdqa [rsp + _DIGEST + 4*SZ4], e
+ movdqa [rsp + _DIGEST + 5*SZ4], f
+ movdqa [rsp + _DIGEST + 6*SZ4], g
+ movdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPS TT2,[inp0+IDX+i*16]
+ MOVPS TT1,[inp1+IDX+i*16]
+ MOVPS TT4,[inp2+IDX+i*16]
+ MOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ pshufb TT2, TMP
+ pshufb TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddd a, [rsp + _DIGEST + 0*SZ4]
+ paddd b, [rsp + _DIGEST + 1*SZ4]
+ paddd c, [rsp + _DIGEST + 2*SZ4]
+ paddd d, [rsp + _DIGEST + 3*SZ4]
+ paddd e, [rsp + _DIGEST + 4*SZ4]
+ paddd f, [rsp + _DIGEST + 5*SZ4]
+ paddd g, [rsp + _DIGEST + 6*SZ4]
+ paddd h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE+0*SHA256_DIGEST_ROW_SIZE ],a
+ movdqa [STATE+1*SHA256_DIGEST_ROW_SIZE ],b
+ movdqa [STATE+2*SHA256_DIGEST_ROW_SIZE ],c
+ movdqa [STATE+3*SHA256_DIGEST_ROW_SIZE ],d
+ movdqa [STATE+4*SHA256_DIGEST_ROW_SIZE ],e
+ movdqa [STATE+5*SHA256_DIGEST_ROW_SIZE ],f
+ movdqa [STATE+6*SHA256_DIGEST_ROW_SIZE ],g
+ movdqa [STATE+7*SHA256_DIGEST_ROW_SIZE ],h
+ DBGPRINTL_XMM "updated transposed sha256 digest", a, b, c, d, e, f, g, h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha256 + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha256 + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha256 + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha256 + 3*8], inp3
+
+ DBGPRINTL64 "updated input data ptrs ", inp0, inp1, inp2, inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif