Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx')
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm | 306
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm | 606
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm | 32
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm | 328
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm | 504
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm | 344
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm | 516
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm | 494
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm | 501
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm | 536
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm | 165
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm | 654
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm | 418
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm | 30
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm | 2515
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/kasumi_avx.c | 386
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm | 30
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm | 30
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm | 30
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm | 30
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm | 537
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm | 518
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm | 239
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm | 194
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm | 264
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm | 272
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c | 733
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm | 298
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm | 321
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm | 355
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm | 356
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm | 428
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm | 31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm | 339
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm | 416
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm | 358
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm | 716
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/pon_avx.asm | 1170
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm | 434
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm | 501
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm | 33
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm | 553
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm | 33
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm | 473
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm | 381
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm | 391
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/snow3g_avx.c | 42
-rwxr-xr-x  src/spdk/intel-ipsec-mb/avx/zuc_avx.asm | 1146
-rwxr-xr-x  src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c | 548
55 files changed, 20721 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..a4de936ff
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm
@@ -0,0 +1,306 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES128 CBC decrypt "by8"
+
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8
+%define xkey0 xmm9
+%define xkey2 xmm10
+%define xkey4 xmm11
+%define xkey6 xmm12
+%define xkey8 xmm13
+%define xkey10 xmm14
+%define xkeytmp xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16]
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey2, [p_keys + 2*16]
+%endif
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+ vmovdqa xkeytmp, [p_keys + 1*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey2
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey10, [p_keys + 10*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey10
+%assign i (i+1)
+%endrep
+
+ vpxor xdata0, xdata0, xIV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+MKGLOBAL(aes_cbc_dec_128_avx,function,internal)
+aes_cbc_dec_128_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ vmovdqu xIV, [p_IV]
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+ jz mult_of_8_blks
+
+	; 1 <= tmp/16 <= 7 (a partial set of 1 to 7 blocks)
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey2, [p_keys + 2*16]
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey10, [p_keys + 10*16]
+
+main_loop2:
+	; num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
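
For reference: per block, CBC decryption is P[i] = D(C[i]) XOR C[i-1] with C[-1] = IV, which is why the do_aes macro above keeps the last ciphertext block in xIV for the next call. A minimal one-block-at-a-time C model with AES-NI intrinsics (a sketch, not the library API; it assumes keys[] holds the 11 expanded AES-128 decrypt round keys and num_bytes is a multiple of 16):

    #include <immintrin.h>
    #include <stdint.h>

    static void cbc_dec_128_ref(const uint8_t *in, const uint8_t *iv,
                                const __m128i keys[11], uint8_t *out,
                                uint64_t num_bytes)
    {
        __m128i prev = _mm_loadu_si128((const __m128i *) iv);

        for (uint64_t off = 0; off < num_bytes; off += 16) {
            __m128i c = _mm_loadu_si128((const __m128i *) (in + off));
            __m128i b = _mm_xor_si128(c, keys[0]);      /* round 0 whitening */

            for (int r = 1; r < 10; r++)
                b = _mm_aesdec_si128(b, keys[r]);       /* rounds 1-9 */
            b = _mm_aesdeclast_si128(b, keys[10]);      /* round 10 */

            b = _mm_xor_si128(b, prev);                 /* CBC chaining */
            prev = c;
            _mm_storeu_si128((__m128i *) (out + off), b);
        }
    }

The assembly gains its speed by running this dependency-free decrypt over eight blocks at once and applying only the serial XOR chain at the end.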
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm
new file mode 100644
index 000000000..4d08bfde5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to compute CBC-MAC. It is based on the 128-bit AES CBC encrypt code.
+
+%define CBC_MAC 1
+%include "avx/aes_cbc_enc_128_x8.asm"
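
The reuse works because CBC-MAC is just CBC encryption that discards the ciphertext and keeps the final block as the tag. A minimal C model over whole blocks (a sketch; aes128_enc_block() is a hypothetical single-block primitive, not the library API, which drives the x8 encrypt kernel instead):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block AES-128 encrypt (assumption). */
    extern void aes128_enc_block(const void *ks, const uint8_t in[16],
                                 uint8_t out[16]);

    static void cbc_mac_128(const void *ks, const uint8_t *msg,
                            uint64_t num_blocks, uint8_t tag[16])
    {
        uint8_t b[16] = {0};                    /* zero IV */

        for (uint64_t i = 0; i < num_blocks; i++) {
            for (int j = 0; j < 16; j++)
                b[j] ^= msg[i * 16 + j];        /* XOR in the next block */
            aes128_enc_block(ks, b, b);         /* encrypt in place */
        }
        memcpy(tag, b, 16);                     /* tag = last CBC block */
    }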
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm
new file mode 100644
index 000000000..d46a29192
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm
@@ -0,0 +1,606 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES128 CNTR enc/decrypt "by8"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+section .data
+default rel
+
+%ifndef CNTR_CCM_AVX
+MKGLOBAL(byteswap_const,data,internal)
+MKGLOBAL(set_byte15,data,internal)
+MKGLOBAL(ddq_add_1,data,internal)
+MKGLOBAL(ddq_add_2,data,internal)
+MKGLOBAL(ddq_add_3,data,internal)
+MKGLOBAL(ddq_add_4,data,internal)
+MKGLOBAL(ddq_add_5,data,internal)
+MKGLOBAL(ddq_add_6,data,internal)
+MKGLOBAL(ddq_add_7,data,internal)
+MKGLOBAL(ddq_add_8,data,internal)
+%endif ;; CNTR_CCM_AVX
+
+align 16
+byteswap_const: ;DDQ 0x000102030405060708090A0B0C0D0E0F
+ DQ 0x08090A0B0C0D0E0F, 0x0001020304050607
+set_byte15: DQ 0x0000000000000000, 0x0100000000000000
+
+ddq_add_1: ;DDQ 0x00000000000000000000000000000001
+ DQ 0x0000000000000001, 0x0000000000000000
+ddq_add_2: ;DDQ 0x00000000000000000000000000000002
+ DQ 0x0000000000000002, 0x0000000000000000
+ddq_add_3: ;DDQ 0x00000000000000000000000000000003
+ DQ 0x0000000000000003, 0x0000000000000000
+ddq_add_4: ;DDQ 0x00000000000000000000000000000004
+ DQ 0x0000000000000004, 0x0000000000000000
+ddq_add_5: ;DDQ 0x00000000000000000000000000000005
+ DQ 0x0000000000000005, 0x0000000000000000
+ddq_add_6: ;DDQ 0x00000000000000000000000000000006
+ DQ 0x0000000000000006, 0x0000000000000000
+ddq_add_7: ;DDQ 0x00000000000000000000000000000007
+ DQ 0x0000000000000007, 0x0000000000000000
+ddq_add_8: ;DDQ 0x00000000000000000000000000000008
+ DQ 0x0000000000000008, 0x0000000000000000
+
+section .text
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xpart xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xtmp xmm8
+%define xbyteswap xmm9
+%define xtmp2 xmm9
+%define xkey0 xmm10
+%define xtmp3 xmm10
+%define xkey3 xmm11
+%define xkey6 xmm12
+%define xkey9 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef CNTR_CCM_AVX
+%ifdef LINUX
+%define job rdi
+%define p_in rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define p_ivlen r9
+%else ;; LINUX
+%define job rcx
+%define p_in rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define p_ivlen rax
+%endif ;; LINUX
+%define p_IV r11
+%else ;; CNTR_CCM_AVX
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8
+%define p_ivlen r9
+%else ;; LINUX
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define num_bits r10
+%define p_ivlen qword [rsp + 8*6]
+%endif ;; LINUX
+%endif ;; CNTR_CCM_AVX
+
+%define tmp r11
+%define flags r11
+
+%define r_bits r12
+%define tmp2 r13
+%define mask r14
+
+%macro do_aes_load 2
+ do_aes %1, %2, 1
+%endmacro
+
+%macro do_aes_noload 2
+ do_aes %1, %2, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+ vpshufb xdata0, xcounter, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+ vpxor xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+ vmovdqa xkeyB, [p_keys + 4*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; key 9
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16)
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%skip_preserve
+
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16]
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ vpslldq xtmp2, 15
+	;; At this point, xtmp2 holds a mask that is all 0s except for
+	;; ones over the bits of the partial byte that must be preserved
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Macro performing AES-CTR.
+;;
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifidn %%CNTR_TYPE, CCM
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+ mov p_ivlen, [job + _iv_len_in_bytes]
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+ mov p_keys, [job + _aes_enc_key_expanded]
+ mov p_out, [job + _dst]
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+ ;; Prepare IV ;;
+
+ ;; Byte 0: flags with L'
+ ;; Calculate L' = 15 - Nonce length - 1 = 14 - IV length
+ mov flags, 14
+ sub flags, p_ivlen
+ vmovd xcounter, DWORD(flags)
+ ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+ ;; Bytes 1 - 7 are always copied (first 7 bytes)
+ mov p_IV, [job + _iv]
+ vpinsrb xcounter, [p_IV], 1
+ vpinsrw xcounter, [p_IV + 1], 1
+ vpinsrd xcounter, [p_IV + 3], 1
+
+ cmp p_ivlen, 7
+ je _finish_nonce_move
+
+ cmp p_ivlen, 8
+ je _iv_length_8
+ cmp p_ivlen, 9
+ je _iv_length_9
+ cmp p_ivlen, 10
+ je _iv_length_10
+ cmp p_ivlen, 11
+ je _iv_length_11
+ cmp p_ivlen, 12
+ je _iv_length_12
+
+ ;; Bytes 8 - 13
+_iv_length_13:
+ vpinsrb xcounter, [p_IV + 12], 13
+_iv_length_12:
+ vpinsrb xcounter, [p_IV + 11], 12
+_iv_length_11:
+ vpinsrd xcounter, [p_IV + 7], 2
+ jmp _finish_nonce_move
+_iv_length_10:
+ vpinsrb xcounter, [p_IV + 9], 10
+_iv_length_9:
+ vpinsrb xcounter, [p_IV + 8], 9
+_iv_length_8:
+ vpinsrb xcounter, [p_IV + 7], 8
+
+_finish_nonce_move:
+ ; last byte = 1
+ vpor xcounter, [rel set_byte15]
+%else ;; CNTR/CNTR_BIT
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; arg5
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12
+ push r13
+ push r14
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%endif ;; CNTR/CNTR_BIT/CCM
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+	jz %%chk ; no leading partial set of 1-7 blocks
+
+	; 1 <= tmp/16 <= 7 (a partial set of 1 to 7 blocks)
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16)
+ jz %%do_return2
+
+ cmp num_bytes, 16
+ jb %%last
+
+ ; process multiples of 8 blocks
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+	; num_bytes = a multiple of 8 blocks plus up to 15 partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CCM
+ mov rax, job
+ or dword [rax + _status], STS_COMPLETED_AES
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 9
+ vaesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; keystream created (in xdata0)
+ vaesenclast xdata0, [p_keys + 16*i]
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2
+	;; At this point, xtmp2 holds a mask that is all 0s except for
+	;; ones over the bits of the partial byte that must be preserved
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp
+%endif
+
+%%store_output:
+ ; copy result into the output buffer
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV]
+ jmp %%bswap_iv
+%endmacro
+
+align 32
+%ifdef CNTR_CCM_AVX
+; JOB_AES_HMAC * aes_cntr_ccm_128_avx(JOB_AES_HMAC *job)
+; arg 1 : job
+MKGLOBAL(aes_cntr_ccm_128_avx,function,internal)
+aes_cntr_ccm_128_avx:
+ DO_CNTR CCM
+%else
+;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_128_avx,function,internal)
+aes_cntr_128_avx:
+ DO_CNTR CNTR
+
+;; aes_cntr_bit_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_bit_128_avx,function,internal)
+aes_cntr_bit_128_avx:
+ DO_CNTR CNTR_BIT
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
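
The byte-by-byte vpinsrb/vpinsrw/vpinsrd sequence in the CCM branch above assembles the first counter block (A_1) in wire order; %%bswap_iv then byte-swaps it once so the main loop can step the counter with plain vpaddq/vpaddd. The block layout itself, as a C sketch of the contents (not the library API; nonce_len is the 7-13 byte p_ivlen):

    #include <stdint.h>
    #include <string.h>

    static void ccm_make_a1(uint8_t block[16], const uint8_t *nonce,
                            uint64_t nonce_len)         /* 7..13 */
    {
        memset(block, 0, 16);
        block[0] = (uint8_t) (14 - nonce_len);  /* flags: L' = 15 - nonce_len - 1 */
        memcpy(&block[1], nonce, nonce_len);    /* bytes 1..nonce_len: nonce */
        block[15] = 1;                          /* big-endian block counter = 1 */
    }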
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm
new file mode 100644
index 000000000..1a4c11602
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm
@@ -0,0 +1,32 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define CNTR_CCM_AVX
+%ifndef AES_CNTR_CCM_128
+%define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+%endif
+%include "avx/aes128_cntr_by8_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..9952c2552
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm
@@ -0,0 +1,328 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES192 CBC decrypt "by8"
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8
+%define xkey0 xmm9
+%define xkey3 xmm10
+%define xkey6 xmm11
+%define xkey9 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16]
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 4*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 5*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 10*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 11*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12
+%assign i (i+1)
+%endrep
+
+ vpxor xdata0, xdata0, xIV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+MKGLOBAL(aes_cbc_dec_192_avx,function,internal)
+aes_cbc_dec_192_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ vmovdqu xIV, [p_IV]
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+ jz mult_of_8_blks
+
+	; 1 <= tmp/16 <= 7 (a partial set of 1 to 7 blocks)
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+
+main_loop2:
+	; num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
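
Relative to the AES-128 version, only the round count changes: the expanded schedule grows to 13 round keys (offsets 0*16 through 12*16) and the macro issues eleven vaesdec rounds before vaesdeclast. A one-block C model parameterized on the round count (a sketch; keys[0..nr] are the expanded decrypt round keys):

    #include <immintrin.h>

    /* nr = 10 for AES-128, 12 for AES-192, 14 for AES-256 */
    static __m128i aes_dec_block(__m128i c, const __m128i *keys, int nr)
    {
        __m128i b = _mm_xor_si128(c, keys[0]);      /* initial whitening */

        for (int r = 1; r < nr; r++)
            b = _mm_aesdec_si128(b, keys[r]);       /* middle rounds */

        return _mm_aesdeclast_si128(b, keys[nr]);   /* final round */
    }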
diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm
new file mode 100644
index 000000000..e926b4413
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm
@@ -0,0 +1,504 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES192 CNTR enc/decrypt "by8"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const
+extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xpart xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xtmp xmm8
+%define xbyteswap xmm9
+%define xtmp2 xmm9
+%define xkey0 xmm10
+%define xtmp3 xmm10
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define num_bits r10
+%define p_ivlen qword [rsp + 8*6]
+%endif
+
+%define tmp r11
+
+%define r_bits r12
+%define tmp2 r13
+%define mask r14
+
+%macro do_aes_load 2
+ do_aes %1, %2, 1
+%endmacro
+
+%macro do_aes_noload 2
+ do_aes %1, %2, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+ vpshufb xdata0, xcounter, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+ vpxor xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16)
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%skip_preserve
+
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16]
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ vpslldq xtmp2, 15
+	;; At this point, xtmp2 holds a mask that is all 0s except for
+	;; ones over the bits of the partial byte that must be preserved
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; Macro performing AES-CTR.
+;;
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12
+ push r13
+ push r14
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+	jz %%chk ; no leading partial set of 1-7 blocks
+
+	; 1 <= tmp/16 <= 7 (a partial set of 1 to 7 blocks)
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16)
+ jz %%do_return2
+
+ cmp num_bytes, 16
+ jb %%last
+
+ ; process multiples of 8 blocks
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+	; num_bytes = a multiple of 8 blocks plus up to 15 partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 11
+ vaesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; keystream created (in xdata0)
+ vaesenclast xdata0, [p_keys + 16*i]
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2
+	;; At this point, xtmp2 holds a mask that is all 0s except for
+	;; ones over the bits of the partial byte that must be preserved
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp
+%endif
+
+%%store_output:
+ ; copy result into the output buffer
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV]
+ jmp %%bswap_iv
+%endmacro
+
+align 32
+%ifdef CNTR_CCM_AVX
+; JOB_AES_HMAC * aes_cntr_ccm_192_avx(JOB_AES_HMAC *job)
+; arg 1 : job
+MKGLOBAL(aes_cntr_ccm_192_avx,function,internal)
+aes_cntr_ccm_192_avx:
+ DO_CNTR CCM
+%else
+;; aes_cntr_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_192_avx,function,internal)
+aes_cntr_192_avx:
+ DO_CNTR CNTR
+
+;; aes_cntr_bit_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_bit_192_avx,function,internal)
+aes_cntr_bit_192_avx:
+ DO_CNTR CNTR_BIT
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
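
In the CNTR_BIT tail handling above, when the bit length is not a multiple of 8, the low (8 - r_bits) bits of the final byte must be taken from the existing output buffer rather than from the keystream XOR. Per byte, the vpand/vpandn/vpor sequence reduces to the following (a sketch, assuming MSB-first bit numbering as in the LTE algorithms):

    #include <stdint.h>

    /* ct: freshly ciphered byte; out_prev: byte already in the output
     * buffer; r_bits: number of message bits in the final byte (1-7). */
    static uint8_t ctr_bit_last_byte(uint8_t ct, uint8_t out_prev,
                                     unsigned r_bits)
    {
        uint8_t keep = (uint8_t) (0xff >> r_bits);  /* e.g. 3 bits -> 0x1f */

        /* top r_bits come from the ciphertext, the rest is preserved */
        return (uint8_t) ((ct & (uint8_t) ~keep) | (out_prev & keep));
    }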
diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..6a8f100ec
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm
@@ -0,0 +1,344 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES256 CBC decrypt "by8"
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8
+%define xkey0 xmm9
+%define xkey3 xmm10
+%define xkey6 xmm11
+%define xkey9 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+
+%macro do_aes_load 1
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16]
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 4*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 5*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 10*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 11*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 13*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey12
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 14*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
+%assign i (i+1)
+%endrep
+
+ vpxor xdata0, xdata0, xIV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by]
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+MKGLOBAL(aes_cbc_dec_256_avx,function,internal)
+aes_cbc_dec_256_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ vmovdqu xIV, [p_IV]
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+ jz mult_of_8_blks
+
+	; 1 <= tmp/16 <= 7 (a partial set of 1 to 7 blocks)
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~7*16
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+
+main_loop2:
+	; num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
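
All three cbc_dec routines share the same dispatch shape: peel off the 1-7 block remainder of num_bytes first (loading round keys along the way), then run the 8-block main loop with every round key held in registers. In outline (a C sketch, with process_blocks() standing in for the unrolled do_aes macro):

    #include <stdint.h>

    static void process_blocks(unsigned nblocks) { (void) nblocks; } /* stand-in */

    static void by8_dispatch(uint64_t num_bytes)
    {
        uint64_t rem = num_bytes & (7 * 16);        /* 1-7 block prefix */

        if (rem)
            process_blocks((unsigned) (rem / 16));  /* do_aes_load 1..7 */

        for (num_bytes &= ~(uint64_t) (7 * 16); num_bytes != 0;
             num_bytes -= 8 * 16)
            process_blocks(8);                      /* do_aes_noload 8 */
    }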
diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm
new file mode 100644
index 000000000..e201339da
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm
@@ -0,0 +1,516 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES256 CNTR enc/decrypt "by8"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const
+extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8
+
+%define CONCAT(a,b) a %+ b
+%define VMOVDQ vmovdqu
+
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xpart xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xtmp xmm8
+%define xbyteswap xmm9
+%define xtmp2 xmm9
+%define xkey0 xmm10
+%define xtmp3 xmm10
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14
+%define xkeyB xmm15
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define num_bits r10
+%define p_ivlen qword [rsp + 8*6]
+%endif
+
+%define tmp r11
+
+%define r_bits r12
+%define tmp2 r13
+%define mask r14
+
+%macro do_aes_load 2
+ do_aes %1, %2, 1
+%endmacro
+
+%macro do_aes_noload 2
+ do_aes %1, %2, 0
+%endmacro
+
+; do_aes num_in_par, cntr_type, load_keys
+; This increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1
+%define %%cntr_type %2
+%define %%load_keys %3
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+ vpshufb xdata0, xcounter, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+ vpxor xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 13*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 14*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 13
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 14
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16)
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%skip_preserve
+
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16]
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ vpslldq xtmp2, 15
+	;; At this point, xtmp2 is all 0s except for the output bits
+	;; that must be preserved within the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; Macro performing AES-CTR.
+;;
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12
+ push r13
+ push r14
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+	jz	%%chk		; no 1-7 block remainder (full blocks are a multiple of 8)
+
+	; 1 to 7 blocks remaining (tmp = 16 to 112 bytes)
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16)
+ jz %%do_return2
+
+ cmp num_bytes, 16
+ jb %%last
+
+ ; process multiples of 8 blocks
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+ ; num_bytes is a multiple of 8 blocks + partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 13
+ vaesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ vaesenclast xdata0, [p_keys + 16*i]
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2
+	;; At this point, xtmp2 is all 0s except for the output bits
+	;; that must be preserved within the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp
+%endif
+
+%%store_output:
+ ; copy result into the output buffer
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV]
+ jmp %%bswap_iv
+%endmacro
+
+align 32
+%ifdef CNTR_CCM_AVX
+; JOB_AES_HMAC * aes_cntr_ccm_256_avx(JOB_AES_HMAC *job)
+; arg 1 : job
+MKGLOBAL(aes_cntr_ccm_256_avx,function,internal)
+aes_cntr_ccm_256_avx:
+ DO_CNTR CCM
+%else
+;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_256_avx,function,internal)
+aes_cntr_256_avx:
+ DO_CNTR CNTR
+
+;; aes_cntr_bit_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_bit_256_avx,function,internal)
+aes_cntr_bit_256_avx:
+ DO_CNTR CNTR_BIT
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
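Taken together, the do_aes macro and the DO_CNTR entry points implement CTR mode by encrypting up to eight counter blocks at a time and XOR-ing the resulting keystream into the input. From the caller's side, the prototype comments above correspond to something like this C sketch (illustrative only; the wrapper and its names are hypothetical):

    #include <stdint.h>

    /* extern declaration mirrors the prototype comment above */
    extern void aes_cntr_256_avx(void *in, void *IV, void *keys, void *out,
                                 uint64_t num_bytes, uint64_t iv_len);

    /* hypothetical wrapper: AES-256-CTR encrypt/decrypt of len bytes */
    static void ctr_256_example(void *in, uint64_t len, uint8_t iv[16],
                                void *enc_key_sched, void *out)
    {
            /* iv_len == 16 uses the caller's counter block as-is;
             * iv_len == 12 treats the IV as nonce + ESP IV and the routine
             * appends a big-endian block counter of 1 (see DO_CNTR above) */
            aes_cntr_256_avx(in, iv, enc_key_sched, out, len, 16);
    }

aes_cntr_bit_256_avx takes the same arguments but with the length given in bits; any remainder bits in the final byte are handled by the masking path in DO_CNTR so that the untouched bits of the last output byte are preserved.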
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm
new file mode 100644
index 000000000..745a8e4d4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm
@@ -0,0 +1,494 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 128 bit CBC AES encrypt and CBC MAC
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%macro VPXOR2 2
+ vpxor %1, %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save: resq 8
+_len: resq 1
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define ARG arg1
+%define LEN arg2
+
+%define IDX rax
+%define TMP rbx
+
+%define KEYS0 arg3
+%define KEYS1 arg4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+%ifdef CBC_MAC
+MKGLOBAL(aes128_cbc_mac_x8,function,internal)
+aes128_cbc_mac_x8:
+%else
+MKGLOBAL(aes_cbc_enc_128_x8,function,internal)
+aes_cbc_enc_128_x8:
+%endif
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+%ifdef CBC_MAC
+ mov [GPR_SAVE_AREA + 8*1], rbx
+ mov [GPR_SAVE_AREA + 8*2], r12
+ mov [GPR_SAVE_AREA + 8*3], r13
+ mov [GPR_SAVE_AREA + 8*4], r14
+ mov [GPR_SAVE_AREA + 8*5], r15
+%ifndef LINUX
+ mov [GPR_SAVE_AREA + 8*6], rsi
+ mov [GPR_SAVE_AREA + 8*7], rdi
+%endif
+%endif
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6]
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+%endif
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ ;; no ciphertext write back for CBC-MAC
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+%endif
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update IV for AES128-CBC / store digest for CBC-MAC
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+%ifndef CBC_MAC
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+%endif
+
+ ;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+%ifdef CBC_MAC
+ mov rbx, [GPR_SAVE_AREA + 8*1]
+ mov r12, [GPR_SAVE_AREA + 8*2]
+ mov r13, [GPR_SAVE_AREA + 8*3]
+ mov r14, [GPR_SAVE_AREA + 8*4]
+ mov r15, [GPR_SAVE_AREA + 8*5]
+%ifndef LINUX
+ mov rsi, [GPR_SAVE_AREA + 8*6]
+ mov rdi, [GPR_SAVE_AREA + 8*7]
+%endif
+%endif
+
+ add rsp, STACK_size
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
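The AES_ARGS comment block at the top of this file corresponds roughly to the C sketch below (the type name, field types and 16-byte alignment are assumptions inferred from the struct comment and from the aligned vmovdqa IV stores in the done: block; the authoritative layout is in mb_mgr_datastruct.asm). The same calling pattern applies to the 192- and 256-bit encrypt variants added in the files that follow: all eight lanes must point to valid buffers, len is in bytes and a multiple of 16, and the routine advances the in/out pointers and updates the IVs (or CBC-MAC digests) in place.

    #include <stdint.h>

    /* hypothetical mirror of the AES_ARGS comment block, for illustration */
    typedef struct {
            const void *in[8];     /* per-lane input pointers                 */
            void       *out[8];    /* per-lane output pointers                */
            const void *keys[8];   /* per-lane expanded AES-128 key schedules */
            uint8_t     IV[8][16]; /* per-lane IV, or running CBC-MAC digest  */
    } __attribute__((aligned(16))) aes_args_sketch_t;

    /* extern declaration mirrors the prototype comment in this file */
    extern void aes_cbc_enc_128_x8(aes_args_sketch_t *args, uint64_t len_bytes);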
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
new file mode 100644
index 000000000..e446f13c3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
@@ -0,0 +1,501 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%macro VPXOR2 2
+ vpxor %1, %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save: resq 1
+_len: resq 1
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+%define TMP rbx
+
+%define KEYS0 REG3
+%define KEYS1 REG4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+
+MKGLOBAL(aes_cbc_enc_192_x8,function,internal)
+aes_cbc_enc_192_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update IV
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm
new file mode 100644
index 000000000..75cf285d9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm
@@ -0,0 +1,536 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%macro VPXOR2 2
+ vpxor %1, %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save: resq 1
+_len: resq 1
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+%define TMP rbx
+
+%define KEYS0 REG3
+%define KEYS1 REG4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+MKGLOBAL(aes_cbc_enc_256_x8,function,internal)
+aes_cbc_enc_256_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+ vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC
+ vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC
+ vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC
+ vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC
+
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+ vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC
+ vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC
+ vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC
+ vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC
+
+
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update IV
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
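+	;; 0x44 copies the processed length into both 64-bit lanes of xmm0,
+	;; so each vpaddq below advances a pair of saved pointers at once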
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm
new file mode 100644
index 000000000..34d03bb99
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm
@@ -0,0 +1,165 @@
+;;
+;; Copyright (c) 2018-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/clear_regs.asm"
+
+;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only.
+;;; It processes only one buffer at a time.
+;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX R9 R10
+;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0
+;;
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + 5*8]
+%endif
+
+%define OUT arg1
+%define IN arg2
+%define IV arg3
+%define KEYS arg4
+%ifdef LINUX
+%define LEN arg5
+%else
+%define LEN2 arg5
+%define LEN r11
+%endif
+
+%define TMP0 rax
+%define TMP1 r10
+
+%define XDATA xmm0
+%define XIN xmm1
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys, uint64_t len)
+;; arg 1: OUT : addr to put clear/cipher text out
+;; arg 2: IN : addr to take cipher/clear text from
+;; arg 3: IV : initialization vector
+;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned)
+;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16)
+;;
+;; AES CFB128 one block encrypt/decrypt implementation.
+;; The function does not update the IV. The result of the operation can be found in OUT.
+;;
+;; It is primarily designed to process a partial block of
+;; DOCSIS 3.1 AES Packet PDU Encryption (I.10)
+;;
+;; It processes up to one block only (up to 16 bytes).
+;;
+;; It makes sure not to read more than LEN bytes from IN and
+;; not to store more than LEN bytes to OUT.
+MKGLOBAL(aes_cfb_128_one_avx,function,)
+MKGLOBAL(aes_cfb_128_one_avx2,function,)
+MKGLOBAL(aes_cfb_128_one_avx512,function,)
+align 32
+aes_cfb_128_one_avx:
+aes_cfb_128_one_avx2:
+aes_cfb_128_one_avx512:
+%ifndef LINUX
+ mov LEN, LEN2
+%endif
+%ifdef SAFE_PARAM
+ cmp IV, 0
+ jz exit_cfb
+
+ cmp KEYS, 0
+ jz exit_cfb
+
+ cmp LEN, 0
+ jz skip_in_out_check
+
+ cmp OUT, 0
+ jz exit_cfb
+
+ cmp IN, 0
+ jz exit_cfb
+
+skip_in_out_check:
+%endif
+ simd_load_avx_16 XIN, IN, LEN
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu XDATA, [IV] ; IV (or next to last block)
+ vpxor XDATA, XDATA, [KEYS + 16*0] ; 0. ARK
+ vaesenc XDATA, XDATA, [KEYS + 16*1] ; 1. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*2] ; 2. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*3] ; 3. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*4] ; 4. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*5] ; 5. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*6] ; 6. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*7] ; 7. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*8] ; 8. ENC
+ vaesenc XDATA, XDATA, [KEYS + 16*9] ; 9. ENC
+ vaesenclast XDATA, XDATA, [KEYS + 16*10] ; 10. ENC
+
+ vpxor XDATA, XIN ; plaintext/ciphertext XOR block cipher encryption
+
+ simd_store_avx OUT, XDATA, LEN, TMP0, TMP1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifdef SAFE_DATA
+ ;; XDATA and XIN are the only scratch SIMD registers used
+ clear_xmms_avx XDATA, XIN
+ clear_scratch_gps_asm
+%endif
+exit_cfb:
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
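Illustrative sketch, not part of the patch itself: a plain C model of the single CFB block that aes_cfb_128_one_avx above computes. aes128_encrypt_block() is a hypothetical helper standing in for the ten AESNI rounds; the real routine operates on the expanded key schedule passed in KEYS.

/* minimal CFB-128 single-block sketch, assuming an AES block helper */
#include <stdint.h>

void aes128_encrypt_block(const void *expanded_keys,
                          const uint8_t in[16], uint8_t out[16]); /* assumed helper */

static void cfb128_one_block(uint8_t *out, const uint8_t *in, const uint8_t iv[16],
                             const void *expanded_keys, uint64_t len /* 0..16 */)
{
        uint8_t keystream[16];
        uint64_t i;

        aes128_encrypt_block(expanded_keys, iv, keystream);  /* E_K(IV)            */
        for (i = 0; i < len; i++)
                out[i] = in[i] ^ keystream[i];                /* out = in ^ E_K(IV) */
}

As in the assembly, at most LEN (<= 16) bytes are read from IN and written to OUT, and the IV is left unchanged.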
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm
new file mode 100644
index 000000000..d71bd8c46
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm
@@ -0,0 +1,654 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_ecb_x_y_avx(void *in,
+; UINT128 keys[],
+; void *out,
+; UINT64 len_bytes);
+;
+; x = direction (enc/dec)
+; y = key size (128/192/256)
+; arg 1: IN: pointer to input (plain or cipher text)
+; arg 2: KEYS: pointer to expanded keys
+; arg 3: OUT: pointer to output (cipher or plain text)
+; arg 4: LEN: length in bytes (multiple of 16)
+;
+
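Illustrative sketch, not part of the patch itself: the block scheduling that the AES_ECB macro below implements, written in plain C for the encrypt direction. ecb_block() is a hypothetical single-block helper; the real code keeps four blocks in flight in XMM registers.

/* remainder (len mod 64) is handled first, then a 4-block main loop */
#include <stdint.h>

void ecb_block(const void *keys, const uint8_t *in, uint8_t *out); /* assumed helper */

static void ecb_by4(const void *keys, const uint8_t *in, uint8_t *out, uint64_t len)
{
        uint64_t idx = len & (3 * 16);   /* 1..3 leftover blocks handled first */
        uint64_t i;

        for (i = 0; i < idx; i += 16)
                ecb_block(keys, in + i, out + i);

        while (idx < len) {              /* main loop: four blocks per iteration */
                for (i = 0; i < 4; i++)
                        ecb_block(keys, in + idx + 16 * i, out + idx + 16 * i);
                idx += 4 * 16;
        }
}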
+%include "include/os.asm"
+
+%ifndef AES_ECB_ENC_128
+%define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+%define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+%define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+%define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+%define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+%define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+%endif
+
+%ifdef LINUX
+%define IN rdi
+%define KEYS rsi
+%define OUT rdx
+%define LEN rcx
+%else
+%define IN rcx
+%define KEYS rdx
+%define OUT r8
+%define LEN r9
+%endif
+
+%define IDX rax
+%define TMP IDX
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+%define XKEY10 xmm8
+%define XKEY_A xmm9
+%define XKEY_B xmm10
+
+section .text
+
+%macro AES_ECB 2
+%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value
+%define %%DIR %2 ; [in] Direction (encrypt/decrypt)
+
+%ifidn %%DIR, ENC
+%define AES vaesenc
+%define AES_LAST vaesenclast
+%else ; DIR = DEC
+%define AES vaesdec
+%define AES_LAST vaesdeclast
+%endif
+ mov TMP, LEN
+ and TMP, 3*16
+ jz %%initial_4
+ cmp TMP, 2*16
+ jb %%initial_1
+ ja %%initial_3
+
+%%initial_2:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+ AES XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+ AES XDATA1, [KEYS + 3*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+ AES XDATA1, [KEYS + 5*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+ AES XDATA1, [KEYS + 7*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+
+ vmovdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+ AES XDATA1, [KEYS + 9*16]
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+ AES XDATA1, XKEY10
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+ AES XDATA1, [KEYS + 11*16]
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES XDATA1, [KEYS + 12*16]
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+ AES XDATA1, [KEYS + 13*16]
+%endif
+
+%if %%NROUNDS == 10
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+ AES_LAST XDATA1, XKEY10
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES_LAST XDATA1, [KEYS + 12*16]
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+ AES_LAST XDATA1, [KEYS + 14*16]
+%endif
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+%%initial_1:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+
+ mov IDX, 1*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+
+ AES XDATA0, XKEY4 ; 4. ENC
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+
+ AES XDATA0, XKEY6 ; 6. ENC
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+
+ AES XDATA0, XKEY_B ; 8. ENC
+
+ vmovdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+%endif
+
+%if %%NROUNDS == 10
+
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+%endif
+
+ vmovdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je %%done
+ jmp %%main_loop
+
+
+%%initial_3:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+ vmovdqu XDATA2, [IN + 2*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+ mov IDX, 3*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+ vmovdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+%%initial_4:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+ vmovdqu XDATA2, [IN + 2*16]
+ vmovdqu XDATA3, [IN + 3*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+ vpxor XDATA3, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+ vmovdqu [OUT + 2*16], XDATA2
+ vmovdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz %%done
+ jmp %%main_loop
+
+ align 16
+%%main_loop:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + IDX + 0*16]
+ vmovdqu XDATA1, [IN + IDX + 1*16]
+ vmovdqu XDATA2, [IN + IDX + 2*16]
+ vmovdqu XDATA3, [IN + IDX + 3*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+ vpxor XDATA3, XKEY0
+
+ add IDX, 4*16
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ vmovdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ vmovdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ vmovdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ vmovdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ cmp IDX, LEN
+ jne %%main_loop
+
+%%done:
+
+ ret
+
+%endmacro
+
+align 16
+MKGLOBAL(AES_ECB_ENC_128,function,internal)
+AES_ECB_ENC_128:
+
+ AES_ECB 10, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_192,function,internal)
+AES_ECB_ENC_192:
+
+ AES_ECB 12, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_256,function,internal)
+AES_ECB_ENC_256:
+
+ AES_ECB 14, ENC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_128,function,internal)
+AES_ECB_DEC_128:
+
+ AES_ECB 10, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_192,function,internal)
+AES_ECB_DEC_192:
+
+ AES_ECB 12, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_256,function,internal)
+AES_ECB_DEC_256:
+
+ AES_ECB 14, DEC
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm
new file mode 100644
index 000000000..615e19050
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm
@@ -0,0 +1,418 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do 128 bit AES XCBC
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%macro VPXOR2 2
+ vpxor %1, %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_XCBC_ARGS_x8 {
+;; void* in[8];
+;; UINT128* keys[8];
+;; UINT128 ICV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_xcbc_mac_128_x8(AES_XCBC_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
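Illustrative sketch, not part of the patch itself: what a single lane of aes_xcbc_mac_128_x8 computes, in plain C. The assembly below interleaves eight such lanes to keep the AESNI units busy. aes128_encrypt_block() is a hypothetical single-block helper operating on the expanded K1 schedule.

/* per-lane CBC-MAC style chaining: ICV = E_K1(ICV ^ block) */
#include <stdint.h>

void aes128_encrypt_block(const void *keys, const uint8_t in[16], uint8_t out[16]); /* assumed helper */

static void xcbc_mac_one_lane(const void *keys, const uint8_t *in,
                              uint64_t len /* multiple of 16 */, uint8_t icv[16])
{
        uint64_t off;
        int i;

        for (off = 0; off < len; off += 16) {
                for (i = 0; i < 16; i++)
                        icv[i] ^= in[off + i];          /* chain next block into ICV */
                aes128_encrypt_block(keys, icv, icv);   /* ICV = E_K1(ICV ^ block)   */
        }
}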
+struc STACK
+_gpr_save: resq 1
+_len: resq 1
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+%define TMP rbx
+
+%define KEYS0 REG3
+%define KEYS1 REG4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+MKGLOBAL(aes_xcbc_mac_128_x8,function,internal)
+aes_xcbc_mac_128_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ mov IN0, [ARG + _aesxcbcarg_in + 8*0]
+ mov IN2, [ARG + _aesxcbcarg_in + 8*2]
+ mov IN4, [ARG + _aesxcbcarg_in + 8*4]
+ mov IN6, [ARG + _aesxcbcarg_in + 8*6]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov TMP, [ARG + _aesxcbcarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV
+ VPXOR2 XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV
+ VPXOR2 XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV
+ VPXOR2 XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV
+ VPXOR2 XDATA4, [ARG + _aesxcbcarg_ICV + 16*4] ; plaintext XOR ICV
+ VPXOR2 XDATA5, [ARG + _aesxcbcarg_ICV + 16*5] ; plaintext XOR ICV
+ VPXOR2 XDATA6, [ARG + _aesxcbcarg_ICV + 16*6] ; plaintext XOR ICV
+ VPXOR2 XDATA7, [ARG + _aesxcbcarg_ICV + 16*7] ; plaintext XOR ICV
+
+ mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesxcbcarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesxcbcarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesxcbcarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesxcbcarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesxcbcarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update ICV
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*4], XDATA4
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*5], XDATA5
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*6], XDATA6
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*7], XDATA7
+
+ ;; update IN
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
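+	;; 0x44 copies the processed length into both 64-bit lanes of xmm0,
+	;; so each vpaddq below advances two of the saved input pointers at once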
+ vpaddq xmm1, xmm0, [ARG + _aesxcbcarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesxcbcarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesxcbcarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesxcbcarg_in + 16*3]
+ vmovdqa [ARG + _aesxcbcarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesxcbcarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesxcbcarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesxcbcarg_in + 16*3], xmm4
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm
new file mode 100644
index 000000000..1bb601e4f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm
new file mode 100644
index 000000000..4de59d5bf
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm
new file mode 100644
index 000000000..de8eadf4c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm
@@ -0,0 +1,30 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define GCM256_MODE 1
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm
new file mode 100644
index 000000000..2aa3a162d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm
@@ -0,0 +1,2515 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;	From the definition of the spec, aadLen must be a multiple of 4 bytes.
+;	The code additionally supports any aadLen length.
+;
+; TLen:
+;	From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+;	poly = x^128 + x^127 + x^126 + x^121 + 1
+;	Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
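Illustrative sketch, not part of the patch itself: assembling the 16-byte pre-counter block described by the IV diagram above (4-byte salt from the SA, 8-byte IPsec sequence number, constant 0x1 in the last word). The helper name is illustrative only, not a library API.

/* salt || 64-bit IV || 0x00000001, as in the IV layout above */
#include <stdint.h>
#include <string.h>

static void build_counter_block(uint8_t j0[16], const uint8_t salt[4], const uint8_t iv[8])
{
        static const uint8_t one_be[4] = { 0x00, 0x00, 0x00, 0x01 };

        memcpy(j0, salt, 4);            /* Salt (from the SA)            */
        memcpy(j0 + 4, iv, 8);          /* 64-bit IV / sequence number   */
        memcpy(j0 + 12, one_be, 4);     /* counter field starts at 0x1   */
}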
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_sse_avx.asm"
+%include "include/memcpy.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen2.asm!"
+%endif
+%endif
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2
+%define NROUNDS 13
+%endif
+
+default rel
+; need to push 4 registers into stack to maintain
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+section .text
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b
+ vpshufd %%T3, %%HK, 01001110b
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+	vpslld	%%T2, %%GH, 31                   ; packed left shifting << 31
+	vpslld	%%T3, %%GH, 30                   ; packed left shifting << 30
+	vpslld	%%T4, %%GH, 25                   ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; packed right shifting >> 1
+ vpsrld %%T3,%%GH,2 ; packed right shifting >> 2
+ vpsrld %%T4,%%GH,7 ; packed right shifting >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
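+
+; Illustrative use of GHASH_MUL (a commented sketch only, not assembled):
+; multiply the current GHASH accumulator by HashKey<<1 mod poly as stored by
+; the precompute step; the register choices below are arbitrary.
+;
+; vmovdqu xmm13, [arg1 + HashKey] ; HashKey<<1 mod poly
+; GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; xmm14 *= HashKey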
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
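+
+; PRECOMPUTE fills out the hash-key table (HashKey itself is stored by the
+; precomp entry point before this macro runs). For i = 1..8 the key structure
+; then holds (offsets come from include/gcm_keys_sse_avx.asm):
+; HashKey_i = HashKey^i << 1 mod poly
+; HashKey_i_k = high64(HashKey_i) XOR low64(HashKey_i) (Karatsuba helper)
+; These tables are consumed below by CALC_AAD_HASH, GHASH_8_ENCRYPT_8_PARALLEL
+; and GHASH_LAST_8.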
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; This and the following inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
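+
+; Example use (a sketch mirroring the callers below): read the remaining
+; r13 (< 16) bytes at r10 into xmm1, with rax, r12 and r15 as GP scratch.
+;
+; READ_SMALL_DATA_INPUT xmm1, r10, r13, rax, r12, r15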
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 15
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%GDATA_KEY %4
+%define %%XTMP0 %5 ; xmm temp reg 0
+%define %%XTMP1 %6 ; xmm temp reg 1
+%define %%XTMP2 %7 ; xmm temp reg 2
+%define %%XTMP3 %8 ; xmm temp reg 3
+%define %%XTMP4 %9 ; xmm temp reg 4
+%define %%XTMP5 %10 ; xmm temp reg 5
+%define %%T1 %11 ; temp reg 1
+%define %%T2 %12 ; temp reg 2
+%define %%T3 %13 ; temp reg 3
+%define %%T4 %14 ; temp reg 4
+%define %%T5 %15 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu %%XTMP0, [%%T1 + 16*0]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
+
+%assign i 1
+%assign j 7
+%rep 7
+ vmovdqu %%XTMP0, [%%T1 + 16*i]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+ ;; calculate hash_key position to start with
+ mov %%T3, %%T2
+ and %%T3, -16 ; 1 to 7 blocks possible here
+ neg %%T3
+ add %%T3, HashKey_1 + 16
+ lea %%T3, [%%GDATA_KEY + %%T3]
+
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+
+%%_AAD_blocks:
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+ jmp %%_AAD_blocks
+
+%%_AAD_reduce:
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
+
+ or %%T2, %%T2
+ je %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+ vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
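+
+; Example use (a sketch of how GCM_INIT below invokes this macro): hash the
+; AAD at arg4 (arg5 bytes) into xmm0 using the key tables in arg1; the
+; remaining arguments are xmm and GP scratch registers.
+;
+; CALC_AAD_HASH arg4, arg5, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax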
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input:
+; GDATA_KEY - struct gcm_key_data *
+; GDATA_CTX - struct gcm_context_data *
+; PLAIN_CYPH_IN - input text
+; PLAIN_CYPH_LEN - input text length
+; DATA_OFFSET - the current data offset
+; ENC_DEC - whether encoding or decoding
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If 16 or more bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
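+
+; In short: PARTIAL_BLOCK completes a block left over from a previous update
+; call: the new input is XORed against the saved E(K, Yn) (PBlockEncKey), the
+; masked result is folded into the GHASH accumulator (with the multiplication
+; deferred until the block is full), the output bytes are written, and
+; PBlockLen is updated in the context.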
+
+
+; If a = total number of plaintext bytes and b = floor(a/16), then
+; %%num_initial_blocks = b mod 8.
+; Encrypt the initial %%num_initial_blocks blocks and apply GHASH on the ciphertext.
+; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN and r14 are used as pointers only, not modified.
+; The updated AAD_HASH is returned in %%T3.
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep ; NROUNDS
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and
+; store cipher/plain text.
+; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
+; - combine current GHASH value into block 0 (XMM1)
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+%ifdef GCM128_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+
+%ifidn %%ENC_DEC, ENC
+%ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+%else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+%endif ; NT_LD
+ vaesenclast reg(j), reg(j), %%T2
+%else
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
+%endif ; %%ENC_DEC
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed left shifting << 30
+ vpslld %%T4, %%T7, 25 ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
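+
+; Note on %%loop_idx: with in_order the eight counter blocks are produced by
+; adding [ONE] and then byte-swapping each block; with out_order the counter
+; is kept pre-swapped and [ONEf] is added directly, avoiding the per-block
+; shuffles. GCM_ENC_DEC below takes the out_order path normally and switches
+; to in_order when the low counter byte is about to wrap (r15d > 255-8).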
+
+
+; GHASH the last 8 ciphertext blocks.
+; %%GDATA is GCM key data
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed left shifting << 30
+ vpslld %%T4, %%T7, 25 ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
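+
+; The accumulated hash of the eight ciphertext blocks is returned in %%T6
+; (xmm14 at the call site in GCM_ENC_DEC), which is then stored to AadHash.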
+
+
+; Encryption of a single block
+; %%GDATA is GCM key data
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep ; NROUNDS
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
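+
+; Example use (a sketch, as in GCM_COMPLETE below): encrypt the saved counter
+; block Y0 in place to obtain E(K, Y0) for the final tag computation.
+;
+; vmovdqu xmm9, [arg2 + OrigIV] ; xmm9 = Y0
+; ENCRYPT_SINGLE_BLOCK arg1, xmm9 ; xmm9 = E(K, Y0)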
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes (8 bytes each) must match STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
+; IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other context fields initialized.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0
+ vpinsrd xmm2, [r10+8], 2
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [rel SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
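+
+; On exit the context is primed for GCM_ENC_DEC: AadHash = GHASH(AAD), AadLen
+; is recorded, InLen and PBlockLen are zeroed, OrigIV holds the 12-byte IV
+; padded with 0x00000001 (Y0) and CurCount holds its byte-reflected form.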
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
+; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], rax ; Update length of data processed
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
+%endif
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_block_length = r13
+ ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
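+
+; Tag lengths of 8, 12 and 16 bytes take dedicated store paths above; any
+; other requested length is stored via the simd_store_avx helper.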
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen2
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+exit_precomp:
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ push r14
+ push r15
+ mov r14, rsp
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ movdqu [rsp + 0*16], xmm6
+%endif
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+exit_init:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ mov rsp, r14
+ pop r15
+ pop r14
+%endif
+ pop r13
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+exit_update_enc:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+exit_update_dec:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+exit_enc_fin:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+exit_dec_fin:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
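+; Single-shot variant: equivalent to calling the init, update and finalize
+; entry points above in sequence (GCM_INIT + GCM_ENC_DEC + GCM_COMPLETE).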
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+exit_enc:
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+exit_dec:
+ FUNC_RESTORE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c
new file mode 100644
index 000000000..4739191ac
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c
@@ -0,0 +1,386 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <limits.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "include/save_xmms.h"
+#include "include/kasumi_internal.h"
+#include "include/clear_regs_mem.h"
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
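+
+/*
+ * Each *_avx wrapper below follows the same pattern: save xmm6-xmm15 on
+ * Windows (all xmm registers are caller-saved on Linux), optionally validate
+ * parameters under SAFE_PARAM, call the shared implementation from
+ * kasumi_internal.h, and clear scratch registers under SAFE_DATA before
+ * restoring the saved xmm state.
+ */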
+
+void
+kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t cipherLengthInBytes)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (cipherLengthInBytes == 0 ||
+ cipherLengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+#endif
+ kasumi_f8_1_buffer(pCtx, IV, pBufferIn, pBufferOut,
+ cipherLengthInBytes);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (cipherLengthInBits == 0 ||
+ cipherLengthInBits > KASUMI_MAX_LEN)
+ return;
+#endif
+ kasumi_f8_1_buffer_bit(pCtx, IV, pBufferIn, pBufferOut,
+ cipherLengthInBits, offsetInBits);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+ const uint64_t IV2, const void *pBufferIn1,
+ void *pBufferOut1, const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL)
+ return;
+
+ if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+ return;
+
+ if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBytes1 == 0 || lengthInBytes1 > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+
+ if (lengthInBytes2 == 0 || lengthInBytes2 > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+#endif
+ kasumi_f8_2_buffer(pCtx, IV1, IV2,
+ pBufferIn1, pBufferOut1, lengthInBytes1,
+ pBufferIn2, pBufferOut2, lengthInBytes2);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+ const uint64_t IV2, const uint64_t IV3,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL)
+ return;
+
+ if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+ return;
+
+ if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+ return;
+
+ if (pBufferIn3 == NULL || pBufferOut3 == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+#endif
+ kasumi_f8_3_buffer(pCtx, IV1, IV2, IV3,
+ pBufferIn1, pBufferOut1,
+ pBufferIn2, pBufferOut2,
+ pBufferIn3, pBufferOut3, lengthInBytes);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const uint64_t IV3, const uint64_t IV4,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL)
+ return;
+
+ if (pBufferIn1 == NULL || pBufferOut1 == NULL)
+ return;
+
+ if (pBufferIn2 == NULL || pBufferOut2 == NULL)
+ return;
+
+ if (pBufferIn3 == NULL || pBufferOut3 == NULL)
+ return;
+
+ if (pBufferIn4 == NULL || pBufferOut4 == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+#endif
+ kasumi_f8_4_buffer(pCtx, IV1, IV2, IV3, IV4,
+ pBufferIn1, pBufferOut1,
+ pBufferIn2, pBufferOut2,
+ pBufferIn3, pBufferOut3,
+ pBufferIn4, pBufferOut4,
+ lengthInBytes);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule,
+ const uint64_t IV[],
+ const void * const pDataIn[], void *pDataOut[],
+ const uint32_t dataLen[], const uint32_t dataCount)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+ uint32_t numLeft = dataCount;
+ const uint64_t *IVPtr;
+ const void * const *pDataInPtr;
+ void **pDataOutPtr;
+ const uint32_t *dataLenPtr;
+ uint32_t i = 0;
+ uint32_t numBuffs;
+
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pKeySchedule == NULL || pDataIn == NULL || pDataOut == NULL ||
+ dataLen == NULL || IV == NULL)
+ return;
+
+ for (i = 0; i < dataCount; i++) {
+ /* Check for NULL pointers */
+ if (pDataIn[i] == NULL || pDataOut[i] == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (dataLen[i] == 0 || dataLen[i] > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+ }
+#endif
+
+ i = 0;
+
+ /* KASUMI F8 n buffer function can handle up to 16 buffers */
+ while (numLeft > 0) {
+ IVPtr = &IV[i];
+ pDataInPtr = &pDataIn[i];
+ pDataOutPtr = &pDataOut[i];
+ dataLenPtr = &dataLen[i];
+ numBuffs = (numLeft > 16) ? 16 : numLeft;
+
+ kasumi_f8_n_buffer(pKeySchedule, IVPtr, pDataInPtr, pDataOutPtr,
+ dataLenPtr, numBuffs);
+ i += numBuffs;
+ numLeft -= numBuffs;
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
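+
+/*
+ * Illustrative only: a minimal sketch of driving the n-buffer API above.
+ * The buffer names, lengths and IV values are hypothetical, not part of
+ * the library.
+ *
+ *     const uint64_t ivs[3] = { iv0, iv1, iv2 };
+ *     const void *const in[3] = { buf_in0, buf_in1, buf_in2 };
+ *     void *out[3] = { buf_out0, buf_out1, buf_out2 };
+ *     const uint32_t len[3] = { len0, len1, len2 };
+ *
+ *     kasumi_f8_n_buffer_avx(&key_sched, ivs, in, out, len, 3);
+ *
+ * Counts larger than 16 are handled by the loop above, which processes the
+ * buffers in chunks of up to 16 per kasumi_f8_n_buffer() call.
+ */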
+
+
+void
+kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx, const void *pBufferIn,
+ const uint32_t lengthInBytes, void *pDigest)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+ return;
+#endif
+ kasumi_f9_1_buffer(pCtx, pBufferIn, lengthInBytes, pDigest);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void
+kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pBufferIn, const uint32_t lengthInBits,
+ void *pDigest, const uint32_t direction)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBits == 0 || lengthInBits > KASUMI_MAX_LEN)
+ return;
+#endif
+ kasumi_f9_1_buffer_user(pCtx, IV, pBufferIn, lengthInBits,
+ pDigest, direction);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+int
+kasumi_init_f8_key_sched_avx(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_init_f8_key_sched(pKey, pCtx);
+}
+
+int
+kasumi_init_f9_key_sched_avx(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_init_f9_key_sched(pKey, pCtx);
+}
+
+size_t
+kasumi_key_sched_size_avx(void)
+{
+ return kasumi_key_sched_size();
+}
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm
new file mode 100644
index 000000000..3e3de0492
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_avx
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
new file mode 100644
index 000000000..57fae603c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_avx
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
new file mode 100644
index 000000000..04c4824d7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_avx
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
new file mode 100644
index 000000000..ee1de7165
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_avx
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
new file mode 100644
index 000000000..9d132ec5f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
@@ -0,0 +1,537 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%include "include/memcpy.asm"
+
+%ifndef AES128_CBC_MAC
+
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+
+%endif
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_mask:
+ dq 0xFFFFFFFFFFFFFFF0
+align 16
+len_masks:
+ dq 0x000000000000FFFF, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw:
+ dq 0x0100010001000100, 0x0100010001000100
+counter_mask:
+ dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%define NROUNDS 9 ; AES-CCM-128
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+%define tmp4 rax
+%define auth_len_aad rax
+
+%define min_idx rbp
+%define flags rbp
+
+%define lane r8
+
+%define iv_len r9
+%define auth_len r9
+
+%define aad_len r10
+%define init_block_addr r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp r12
+%define tmp2 r13
+%define tmp3 r14
+
+%define good_lane r15
+%define min_job r15
+
+%define init_block0 xmm0
+%define ccm_lens xmm1
+%define min_len_idx xmm2
+%define xtmp0 xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
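+;; ENCRYPT_SINGLE_BLOCK performs one AES-128 encryption of the block in %%XMM0
+;; using the expanded keys at %%GDATA: the whitening key, NROUNDS (9 for
+;; AES-CCM-128) middle rounds and a final vaesenclast round.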
+
+;;; ===========================================================================
+;;; AES CCM auth job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Find free lane
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 15
+ shr unused_lanes, 4
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ;; Copy job info into lane
+ mov [state + _aes_ccm_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_ccm_args_keys + lane*8], tmp
+ ;; init_done = 0
+ mov word [state + _aes_ccm_init_done + lane*2], 0
+ lea tmp, [lane * 8]
+
+ vpxor init_block0, init_block0
+ vmovdqa [state + _aes_ccm_args_IV + tmp*2], init_block0
+
+ ;; Prepare initial Block 0 for CBC-MAC-128
+
+ ;; Byte 0: flags with L' and M' (AAD later)
+ ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
+ mov flags, 14
+ mov iv_len, [job + _iv_len_in_bytes]
+ sub flags, iv_len
+ ;; Calculate M' = (Digest length - 2) / 2
+ mov tmp, [job + _auth_tag_output_len_in_bytes]
+ sub tmp, 2
+
+        shl     tmp, 2  ; M' << 3 (one shl by 2 combines the shr by 1, to halve, with the shl by 3)
+ or flags, tmp
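+        ;; Example: a 13-byte nonce and a 16-byte tag give L' = 14 - 13 = 1 and
+        ;; M' = (16 - 2) / 2 = 7, so flags = (7 << 3) | 1 = 0x39 before the
+        ;; Adata bit is set below.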
+
+ ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+ ;; Bytes 1 - 7 are always copied (first 7 bytes)
+ mov tmp, [job + _iv]
+ vpinsrb init_block0, [tmp], 1
+ vpinsrw init_block0, [tmp + 1], 1
+ vpinsrd init_block0, [tmp + 3], 1
+
+ cmp iv_len, 7
+ je %%_finish_nonce_move
+
+ cmp iv_len, 8
+ je %%_iv_length_8
+ cmp iv_len, 9
+ je %%_iv_length_9
+ cmp iv_len, 10
+ je %%_iv_length_10
+ cmp iv_len, 11
+ je %%_iv_length_11
+ cmp iv_len, 12
+ je %%_iv_length_12
+
+ ;; Bytes 8 - 13
+%%_iv_length_13:
+ vpinsrb init_block0, [tmp + 12], 13
+%%_iv_length_12:
+ vpinsrb init_block0, [tmp + 11], 12
+%%_iv_length_11:
+ vpinsrd init_block0, [tmp + 7], 2
+ jmp %%_finish_nonce_move
+%%_iv_length_10:
+ vpinsrb init_block0, [tmp + 9], 10
+%%_iv_length_9:
+ vpinsrb init_block0, [tmp + 8], 9
+%%_iv_length_8:
+ vpinsrb init_block0, [tmp + 7], 8
+
+%%_finish_nonce_move:
+
+ ;; Bytes 14 & 15 (message length), in Big Endian
+ mov ax, [job + _msg_len_to_hash_in_bytes]
+ xchg al, ah
+ vpinsrw init_block0, ax, 7
+
+ mov aad_len, [job + _cbcmac_aad_len]
+ ;; Initial length to authenticate (Block 0)
+ mov auth_len, 16
+        ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD, padded
+        ;; so the total length is a multiple of 16B)
+ lea auth_len_aad, [aad_len + (2 + 15) + 16]
+ and auth_len_aad, -16
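+        ;; Example: aad_len = 20 gives (20 + 17 + 16) & -16 = 48 bytes, i.e.
+        ;; Block 0 (16B) plus the 2-byte AAD length field and 20 AAD bytes
+        ;; padded out to the next 16-byte boundary (32B).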
+
+ or aad_len, aad_len
+ cmovne auth_len, auth_len_aad
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ vphminposuw min_len_idx, ccm_lens
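+        ;; The eight lane lengths live as 16-bit words in a single xmm register,
+        ;; so vphminposuw yields both the minimum length and its lane index in
+        ;; one instruction; the round scheduler below keys off that result.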
+
+ mov tmp, lane
+ shl tmp, 6
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp]
+ or aad_len, aad_len
+ je %%_aad_complete
+
+ or flags, (1 << 6) ; Set Adata bit in flags
+
+ ;; Copy AAD
+ ;; Set all 0s in last block (padding)
+ lea tmp, [init_block_addr + auth_len]
+ sub tmp, 16
+ vpxor xtmp0, xtmp0
+ vmovdqa [tmp], xtmp0
+
+ ;; Start copying from second block
+ lea tmp, [init_block_addr+16]
+ mov rax, aad_len
+ xchg al, ah
+ mov [tmp], ax
+ add tmp, 2
+ mov tmp2, [job + _cbcmac_aad]
+ memcpy_avx_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
+
+%%_aad_complete:
+
+ ;; Finish Block 0 with Byte 0
+ vpinsrb init_block0, BYTE(flags), 0
+ vmovdqa [init_block_addr], init_block0
+
+ mov [state + _aes_ccm_args_in + lane * 8], init_block_addr
+
+ cmp byte [state + _aes_ccm_unused_lanes], 0xf
+ jne %%_return_null
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
+ bt unused_lanes, 35
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp QWORD [state + _aes_ccm_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_ccm_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_ccm_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_ccm_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_ccm_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
+
+ ; Copy good_lane to empty lanes
+ movzx tmp, word [state + _aes_ccm_init_done + good_lane*2]
+ mov tmp2, [state + _aes_ccm_args_in + good_lane*8]
+ mov tmp3, [state + _aes_ccm_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xtmp0, [state + _aes_ccm_args_IV + good_lane]
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ vpor ccm_lens, [rel len_masks + 16*I]
+ mov [state + _aes_ccm_init_done + I*2], WORD(tmp)
+ mov [state + _aes_ccm_args_in + I*8], tmp2
+ mov [state + _aes_ccm_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ ;; Find min length
+ vphminposuw min_len_idx, ccm_lens
+
+%endif ; end FLUSH
+
+%%_ccm_round:
+ vpextrw len2, min_len_idx, 0 ; min value
+ vpextrw min_idx, min_len_idx, 1 ; min index (0...7)
+
+ mov min_job, [state + _aes_ccm_job_in_lane + min_idx*8]
+
+ or len2, len2
+ je %%_len_is_0
+ ;; subtract min length from all lengths
+ vpshufb min_len_idx, min_len_idx, [rel dupw] ; broadcast min length
+ vpsubw ccm_lens, min_len_idx
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and min_idx are intact
+
+%%_len_is_0:
+
+ movzx tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
+ cmp WORD(tmp), 0
+ je %%_prepare_full_blocks_to_auth
+ cmp WORD(tmp), 1
+ je %%_prepare_partial_block_to_auth
+
+%%_encrypt_digest:
+
+ ;; Set counter block 0 (reusing previous initial block 0)
+ mov tmp, min_idx
+ shl tmp, 3
+ vmovdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
+
+ vpand init_block0, [rel counter_mask]
+
+ mov tmp2, [state + _aes_ccm_args_keys + tmp]
+ ENCRYPT_SINGLE_BLOCK tmp2, init_block0
+ vpxor init_block0, [state + _aes_ccm_args_IV + tmp * 2]
+
+ ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
+ mov min_job, [state + _aes_ccm_job_in_lane + tmp]
+ mov tmp3, [min_job + _auth_tag_output_len_in_bytes]
+ mov tmp2, [min_job + _auth_tag_output]
+
+ simd_store_avx tmp2, init_block0, tmp3, tmp, tmp4
+%%_update_lanes:
+ ; Update unused lanes
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, min_idx
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ; Set return job
+ mov job_rax, min_job
+
+ mov qword [state + _aes_ccm_job_in_lane + min_idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ vpxor xtmp0, xtmp0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ shl min_idx, 3
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
+ vmovdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + min_idx], 0
+%else
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
+ ;; of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + I*8], 0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SUBMIT
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax
+ jmp %%_return
+
+%%_prepare_full_blocks_to_auth:
+
+ cmp dword [min_job + _cipher_direction], 2 ; DECRYPT
+ je %%_decrypt
+
+%%_encrypt:
+ mov tmp, [min_job + _src]
+ add tmp, [min_job + _hash_start_src_offset_in_bytes]
+ jmp %%_set_init_done_1
+
+%%_decrypt:
+ mov tmp, [min_job + _dst]
+
+%%_set_init_done_1:
+ mov [state + _aes_ccm_args_in + min_idx*8], tmp
+ mov word [state + _aes_ccm_init_done + min_idx*2], 1
+
+ ; Check if there are full blocks to hash
+ mov tmp, [min_job + _msg_len_to_hash_in_bytes]
+ and tmp, -16
+ je %%_prepare_partial_block_to_auth
+
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ jmp %%_ccm_round
+
+%%_prepare_partial_block_to_auth:
+ ; Check if partial block needs to be hashed
+ mov auth_len, [min_job + _msg_len_to_hash_in_bytes]
+ and auth_len, 15
+ je %%_encrypt_digest
+
+ mov word [state + _aes_ccm_init_done + min_idx * 2], 2
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ mov tmp2, min_idx
+ shl tmp2, 6
+ add tmp2, 16 ; pb[AES_BLOCK_SIZE]
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp2]
+ mov tmp2, [state + _aes_ccm_args_in + min_idx * 8]
+
+ simd_load_avx_15_1 xtmp0, tmp2, auth_len
+
+%%_finish_partial_block_copy:
+ vmovdqa [init_block_addr], xtmp0
+ mov [state + _aes_ccm_args_in + min_idx * 8], init_block_addr
+
+ jmp %%_ccm_round
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
+SUBMIT_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
+FLUSH_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
new file mode 100644
index 000000000..e17023004
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
@@ -0,0 +1,518 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw:
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+%define m_last r10
+%define n r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp3 r12
+%define tmp4 r13
+%define tmp2 r14
+
+%define good_lane r15
+%define rbits r15
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; AES CMAC job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Find free lane
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+ ;; Copy job info into lane
+ mov [state + _aes_cmac_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _key_expanded]
+ mov [state + _aes_cmac_args_keys + lane*8], tmp
+ mov tmp, lane
+ shl tmp, 4 ; lane*16
+
+ ;; Zero IV to store digest
+ vpxor xmm0, xmm0
+ vmovdqa [state + _aes_cmac_args_IV + tmp], xmm0
+
+ lea m_last, [state + _aes_cmac_scratch + tmp]
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CMAC)
+ mov len, [job + _msg_len_to_hash_in_bits]
+ mov rbits, len
+ add len, 7 ; inc len if there are remainder bits
+ shr len, 3
+ and rbits, 7
+
+ ;; Check number of blocks and for partial block
+ mov r, len ; set remainder
+ and r, 0xf
+
+ lea n, [len + 0xf] ; set num blocks
+ shr n, 4
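+        ;; Example: a 65-bit message gives len = (65 + 7) >> 3 = 9 bytes,
+        ;; rbits = 1, r = 9 and n = (9 + 15) >> 4 = 1 block, so it is handled
+        ;; as a single partial block on the 3GPP bit-padding path.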
+
+ jz %%_lt_one_block ; check one or more blocks?
+
+ ;; One or more blocks, potentially partial
+ mov word [state + _aes_cmac_init_done + lane*2], 0
+
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _aes_cmac_args_in + lane*8], tmp2
+
+ ;; len = (n-1)*16
+ lea tmp2, [n - 1]
+ shl tmp2, 4
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ ;; check remainder bits
+ or rbits, rbits
+ jnz %%_not_complete_block_3gpp
+
+ ;; check if complete block
+ or r, r
+ jz %%_complete_block
+
+%%_not_complete_block:
+ ;; M_last = padding(M_n) XOR K2
+ lea tmp, [rel padding_0x80_tab16 + 16]
+ sub tmp, r
+ vmovdqu xmm0, [tmp]
+ vmovdqa [m_last], xmm0
+
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3
+
+ memcpy_avx_16 m_last, tmp, r, tmp4, iv
+
+ ;; src + n + r
+ mov tmp3, [job + _skey2]
+ vmovdqa xmm1, [m_last]
+ vmovdqu xmm0, [tmp3]
+ vpxor xmm0, xmm1
+ vmovdqa [m_last], xmm0
+
+%%_step_5:
+ ;; Find min length
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ vphminposuw xmm1, xmm0
+
+ cmp byte [state + _aes_cmac_unused_lanes], 0xf
+ jne %%_return_null
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
+ bt unused_lanes, 35
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_cmac_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_cmac_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_cmac_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_cmac_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
+
+ ; Copy good_lane to empty lanes
+ mov tmp2, [state + _aes_cmac_args_in + good_lane*8]
+ mov tmp3, [state + _aes_cmac_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_cmac_args_IV + good_lane]
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_cmac_args_in + I*8], tmp2
+ mov [state + _aes_cmac_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_cmac_args_IV + I*16], xmm2
+ vpor xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ ;; Find min length
+ vphminposuw xmm1, xmm0
+
+%endif ; end FLUSH
+
+%%_cmac_round:
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+        vpextrw DWORD(idx), xmm1, 1     ; min index (0...7)
+ cmp len2, 0
+ je %%_len_is_0
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm1
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and idx are intact
+
+ vmovdqa xmm0, [state + _aes_cmac_lens] ; preload lens
+%%_len_is_0:
+ ; Check if job complete
+ test word [state + _aes_cmac_init_done + idx*2], 0xffff
+ jnz %%_copy_complete_digest
+
+ ; Finish step 6
+ mov word [state + _aes_cmac_init_done + idx*2], 1
+
+ XVPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ vphminposuw xmm1, xmm0 ; find min length
+
+ mov tmp3, idx
+ shl tmp3, 4 ; idx*16
+ lea m_last, [state + _aes_cmac_scratch + tmp3]
+ mov [state + _aes_cmac_args_in + idx*8], m_last
+
+ jmp %%_cmac_round
+
+%%_copy_complete_digest:
+ ; Job complete, copy digest to AT output
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov tmp4, idx
+ shl tmp4, 4
+ lea tmp3, [state + _aes_cmac_args_IV + tmp4]
+ mov tmp4, [job_rax + _auth_tag_output_len_in_bytes]
+ mov tmp2, [job_rax + _auth_tag_output]
+
+ cmp tmp4, 16
+ jne %%_ne_16_copy
+
+ ;; 16 byte AT copy
+ vmovdqa xmm0, [tmp3]
+ vmovdqu [tmp2], xmm0
+ jmp %%_update_lanes
+
+%%_ne_16_copy:
+ memcpy_avx_16 tmp2, tmp3, tmp4, lane, iv
+
+%%_update_lanes:
+ ; Update unused lanes
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+ ; Set return job
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov qword [state + _aes_cmac_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ ;; Clear digest (in memory for IV) and scratch memory of returned job
+ vmovdqa [tmp3], xmm0
+
+ shl idx, 4
+ vmovdqa [state + _aes_cmac_scratch + idx], xmm0
+
+%else
+ ;; Clear digest and scratch memory of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_cmac_args_IV + I*16], xmm0
+ vmovdqa [state + _aes_cmac_scratch + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif ;; SUBMIT
+
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax
+ jmp %%_return
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+%%_complete_block:
+
+ ;; Block size aligned
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp2, tmp3
+
+ ;; M_last = M_n XOR K1
+ mov tmp3, [job + _skey1]
+ vmovdqu xmm0, [tmp3]
+ vmovdqu xmm1, [tmp2]
+ vpxor xmm0, xmm1
+ vmovdqa [m_last], xmm0
+
+ jmp %%_step_5
+
+%%_lt_one_block:
+ ;; Single partial block
+ mov word [state + _aes_cmac_init_done + lane*2], 1
+ mov [state + _aes_cmac_args_in + lane*8], m_last
+
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ XVPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ mov n, 1
+ jmp %%_not_complete_block
+
+%%_not_complete_block_3gpp:
+ ;; bit pad last block
+ ;; xor with skey2
+ ;; copy to m_last
+
+ ;; load pointer to src
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3
+
+ ;; check if partial block
+ or r, r
+ jz %%_load_full_block_3gpp
+
+ simd_load_avx_15_1 xmm0, tmp, r
+ dec r
+
+%%_update_mlast_3gpp:
+ ;; set last byte padding mask
+ ;; shift into correct xmm idx
+
+ ;; save and restore rcx on windows
+%ifndef LINUX
+ mov tmp, rcx
+%endif
+ mov rcx, rbits
+ mov tmp3, 0xff
+ shr tmp3, cl
+ movq xmm2, tmp3
+ XVPSLLB xmm2, r, xmm1, tmp2
+
+ ;; pad final byte
+ vpandn xmm2, xmm0
+%ifndef LINUX
+ mov rcx, tmp
+%endif
+ ;; set OR mask to pad final bit
+ mov tmp2, tmp3
+ shr tmp2, 1
+ xor tmp2, tmp3 ; XOR to get OR mask
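+        ;; Example: rbits = 3 gives tmp3 = 0xff >> 3 = 0x1f, so the vpandn above
+        ;; keeps only the top 3 message bits of the last byte, and the OR mask
+        ;; 0x1f ^ (0x1f >> 1) = 0x10 sets the single padding '1' bit right
+        ;; after them.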
+ movq xmm3, tmp2
+ ;; xmm1 contains shift table from previous shift
+ vpshufb xmm3, xmm1
+
+ ;; load skey2 address
+ mov tmp3, [job + _skey2]
+ vmovdqu xmm1, [tmp3]
+
+ ;; set final padding bit
+ vpor xmm2, xmm3
+
+ ;; XOR last partial block with skey2
+ ;; update mlast
+ vpxor xmm2, xmm1
+ vmovdqa [m_last], xmm2
+
+ jmp %%_step_5
+
+%%_load_full_block_3gpp:
+ vmovdqu xmm0, [tmp]
+ mov r, 0xf
+ jmp %%_update_mlast_3gpp
+%endif
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
+SUBMIT_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
+FLUSH_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
new file mode 100644
index 000000000..dbd2a4547
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
@@ -0,0 +1,239 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw:
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp rbp
+%define idx rbp
+
+%define tmp3 r8
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp qword [state + _aes_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp qword [state + _aes_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
+
+ ; copy good_lane to empty lanes
+ mov tmp1, [state + _aes_args_in + good_lane*8]
+ mov tmp2, [state + _aes_args_out + good_lane*8]
+ mov tmp3, [state + _aes_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_args_IV + good_lane]
+ vmovdqa xmm0, [state + _aes_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_args_in + I*8], tmp1
+ mov [state + _aes_args_out + I*8], tmp2
+ mov [state + _aes_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_args_IV + I*16], xmm2
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+ ;; Clear IVs of returned job and "NULL lanes"
+ vpxor xmm0, xmm0
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_args_IV + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
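[Editor's note] The flush path above must run the 8-lane kernel even when some lanes are empty. A minimal C sketch of the scheduling idea (the struct layout and helper names are illustrative, not the real MB_MGR_AES_OOO definition): empty lanes are pointed at a valid lane's buffers and their lengths forced to 0xFFFF (the len_masks ORs), so the kernel can run safely and the lane with the least data left wins the vphminposuw minimum.

    #include <stdint.h>
    #include <string.h>

    struct aes_ooo_sketch {
            const void *in[8];
            void *out[8];
            const void *keys[8];
            uint8_t iv[8][16];
            uint16_t lens[8];
            void *job_in_lane[8];
    };

    /* Returns the lane whose job completes next, or -1 if nothing is queued. */
    static int flush_pick_and_pad(struct aes_ooo_sketch *s)
    {
            int i, good = -1, min_idx = 0;
            uint16_t min_len = 0xFFFF;

            for (i = 0; i < 8; i++)
                    if (s->job_in_lane[i] != NULL)
                            good = i;
            if (good < 0)
                    return -1;

            for (i = 0; i < 8; i++) {
                    if (s->job_in_lane[i] == NULL) {
                            /* borrow the good lane's args; 0xFFFF never wins */
                            s->in[i] = s->in[good];
                            s->out[i] = s->out[good];
                            s->keys[i] = s->keys[good];
                            memcpy(s->iv[i], s->iv[good], 16);
                            s->lens[i] = 0xFFFF;
                    }
                    if (s->lens[i] < min_len) {     /* vphminposuw equivalent */
                            min_len = s->lens[i];
                            min_idx = i;
                    }
            }
            if (min_len != 0) {
                    for (i = 0; i < 8; i++)         /* all lanes advance together */
                            s->lens[i] -= min_len;
                    /* the real code now calls aes_cbc_enc_128_x8(args, min_len) */
            }
            return min_idx;                         /* lane min_idx's job is done */
    }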
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
new file mode 100644
index 000000000..c95fa1f6c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
@@ -0,0 +1,194 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+
+align 16
+dupw:
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ mov len, [job + _msg_len_to_cipher_in_bytes]
+ and len, -16 ; DOCSIS may pass size unaligned to block size
+ mov iv, [job + _iv]
+ mov [state + _aes_unused_lanes], unused_lanes
+
+ mov [state + _aes_job_in_lane + lane*8], job
+
+ vmovdqa xmm0, [state + _aes_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
+ vmovdqa [state + _aes_lens], xmm0
+
+ mov tmp, [job + _src]
+ add tmp, [job + _cipher_start_src_offset_in_bytes]
+ vmovdqu xmm0, [iv]
+ mov [state + _aes_args_in + lane*8], tmp
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_args_keys + lane*8], tmp
+ mov tmp, [job + _dst]
+ mov [state + _aes_args_out + lane*8], tmp
+ shl lane, 4 ; multiply by 16
+ vmovdqa [state + _aes_args_IV + lane], xmm0
+
+ cmp unused_lanes, 0xf
+ jne return_null
+
+ ; Find min length
+ vmovdqa xmm0, [state + _aes_lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+ ;; Clear IV
+ vpxor xmm0, xmm0
+ shl idx, 3 ; multiply by 8
+ vmovdqa [state + _aes_args_IV + idx*2], xmm0
+ mov qword [state + _aes_args_keys + idx], 0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
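[Editor's note] The submit path tracks free lanes in a single register, four bits per lane, with the 0xF nibble acting as a sentinel; the initial value 0xF76543210 is written by init_mb_mgr_avx() later in this series. A small C sketch of that bookkeeping (helper names are illustrative):

    #include <stdint.h>

    static unsigned lane_pop(uint64_t *unused_lanes)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF);

            *unused_lanes >>= 4;
            return lane;
    }

    static void lane_push(uint64_t *unused_lanes, unsigned lane)
    {
            *unused_lanes = (*unused_lanes << 4) | lane;
    }

    static int all_lanes_busy(uint64_t unused_lanes)
    {
            return unused_lanes == 0xF;     /* only the sentinel nibble remains */
    }

Submit pops a lane, fills its pointers and length, and only calls the x8 kernel once all_lanes_busy() holds. The flush path's "bt unused_lanes, 32+3" test is the mirror check: with all eight lanes free the sentinel sits in nibble position 8, so bit 35 is set and there is nothing to flush.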
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
new file mode 100644
index 000000000..a810842a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
@@ -0,0 +1,264 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X8
+
+section .data
+default rel
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw:
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define icv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp r10
+%define idx rbp
+
+%define tmp3 r8
+%define lane_data r9
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal)
+FLUSH_JOB_AES_XCBC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel three]
+ cmp qword [state + _aes_xcbc_ldata + 4 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel four]
+ cmp qword [state + _aes_xcbc_ldata + 5 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel five]
+ cmp qword [state + _aes_xcbc_ldata + 6 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel six]
+ cmp qword [state + _aes_xcbc_ldata + 7 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel seven]
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ mov tmp1, [state + _aes_xcbc_args_in + idx*8]
+ mov tmp3, [state + _aes_xcbc_args_keys + idx*8]
+ shl idx, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_xcbc_args_ICV + idx]
+ vmovdqa xmm0, [state + _aes_xcbc_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_xcbc_args_in + I*8], tmp1
+ mov [state + _aes_xcbc_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm2
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ ; Find min length
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop
+
+ mov dword [lane_data + _xcbc_final_done], 1
+ mov word [state + _aes_xcbc_lens + 2*idx], 16
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp
+ jmp copy_lane_data
+
+end_loop:
+ mov job_rax, [lane_data + _xcbc_job_in_lane]
+ mov icv, [job_rax + _auth_tag_output]
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ vmovq [icv], xmm0
+ vpextrd [icv + 8], xmm0, 2
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+ ;; Clear ICV's and final blocks in returned job and NULL lanes
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm0
+ lea lane_data, [state + _aes_xcbc_ldata + (I * _XCBC_LANE_DATA_size)]
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
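[Editor's note] Each XCBC lane finishes in two steps, tracked by _xcbc_final_done: the first time a lane's length reaches zero it is re-queued with the precomputed final block, and only the second completion emits the 96-bit ICV. A compact C sketch of that state machine, with illustrative names only:

    #include <stdint.h>
    #include <string.h>

    struct xcbc_lane_sketch {
            void *job;
            int final_done;
            uint8_t final_block[32];
            const void *in;
            uint16_t len;
    };

    static void *xcbc_lane_done(struct xcbc_lane_sketch *ld,
                                const uint8_t icv_state[16], uint8_t icv_out[12])
    {
            if (!ld->final_done) {
                    ld->final_done = 1;             /* pass 1: queue the final block */
                    ld->in = ld->final_block;
                    ld->len = 16;
                    return NULL;                    /* loop back to the x8 kernel */
            }
            memcpy(icv_out, icv_state, 12);         /* pass 2: AES-XCBC-MAC-96 truncation */
            return ld->job;                         /* hand the completed job back */
    }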
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
new file mode 100644
index 000000000..38f6a6470
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
@@ -0,0 +1,272 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X8
+
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define len r11
+%define idx rbp
+%define tmp2 rbp
+%define tmp r14
+
+%define lane r8
+%define icv r9
+%define p2 r9
+
+%define last_len r10
+
+%define lane_data r12
+%define p r13
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
+SUBMIT_JOB_AES_XCBC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+ mov [lane_data + _xcbc_job_in_lane], job
+ mov dword [lane_data + _xcbc_final_done], 0
+ mov tmp, [job + _k1_expanded]
+ mov [state + _aes_xcbc_args_keys + lane*8], tmp
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+
+ mov last_len, len
+
+ cmp len, 16
+ jle small_buffer
+
+ mov [state + _aes_xcbc_args_in + lane*8], p
+	add	p, len		; set pointer to end of data
+
+ and last_len, 15 ; Check lsbs of msg len
+ jnz slow_copy ; if not 16B mult, do slow copy
+
+fast_copy:
+ vmovdqu xmm0, [p - 16] ; load last block M[n]
+ mov tmp, [job + _k2] ; load K2 address
+ vmovdqu xmm1, [tmp] ; load K2
+ vpxor xmm0, xmm0, xmm1 ; M[n] XOR K2
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ sub len, 16 ; take last block off length
+end_fast_copy:
+ vpxor xmm0, xmm0, xmm0
+ shl lane, 4 ; multiply by 16
+ vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0
+
+ vmovdqa xmm0, [state + _aes_xcbc_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, len, no_scale
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop
+
+ mov dword [lane_data + _xcbc_final_done], 1
+
+ vmovdqa xmm0, [state + _aes_xcbc_lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, 16, scale_x16
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp
+ jmp start_loop
+
+end_loop:
+ ; process completed job "idx"
+ mov job_rax, [lane_data + _xcbc_job_in_lane]
+ mov icv, [job_rax + _auth_tag_output]
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ vmovq [icv], xmm0
+ vpextrd [icv + 8], xmm0, 2
+
+%ifdef SAFE_DATA
+ ;; Clear ICV
+ vpxor xmm0, xmm0
+ vmovdqa [state + _aes_xcbc_args_ICV + idx], xmm0
+
+ ;; Clear final block (32 bytes)
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+small_buffer:
+ ; For buffers <= 16 Bytes
+ ; The input data is set to final block
+ lea tmp, [lane_data + _xcbc_final_block] ; final block
+ mov [state + _aes_xcbc_args_in + lane*8], tmp
+	add	p, len			; set pointer to end of data
+ cmp len, 16
+ je fast_copy
+
+slow_copy:
+ and len, ~15 ; take final block off len
+ sub p, last_len ; adjust data pointer
+ lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
+ sub p2, last_len ; adjust data pointer backwards
+ memcpy_avx_16_1 p2, p, last_len, tmp, tmp2
+ vmovdqa xmm0, [rel x80] ; fill reg with padding
+ vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
+ vmovdqu xmm0, [p2] ; load final block to process
+ mov tmp, [job + _k3] ; load K3 address
+ vmovdqu xmm1, [tmp] ; load K3
+ vpxor xmm0, xmm0, xmm1 ; M[n] XOR K3
+ vmovdqu [lane_data + _xcbc_final_block], xmm0 ; write final block
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
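[Editor's note] The fast_copy/slow_copy paths above build the final block per the AES-XCBC-MAC rule (RFC 3566): a message that is a positive multiple of 16 bytes has its last block XOR-ed with subkey K2, otherwise the partial tail is padded with 0x80 and zeros and XOR-ed with K3. A minimal C sketch under those assumptions, with hypothetical names:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void xcbc_build_final_block(uint8_t final_blk[16],
                                       const uint8_t *msg, size_t len,
                                       const uint8_t k2[16], const uint8_t k3[16])
    {
            size_t last = len % 16;
            unsigned i;

            if (len != 0 && last == 0) {            /* full final block: XOR with K2 */
                    memcpy(final_blk, msg + len - 16, 16);
                    for (i = 0; i < 16; i++)
                            final_blk[i] ^= k2[i];
            } else {                                /* partial block: pad 10..0, XOR K3 */
                    memset(final_blk, 0, 16);
                    memcpy(final_blk, msg + len - last, last);
                    final_blk[last] = 0x80;
                    for (i = 0; i < 16; i++)
                            final_blk[i] ^= k3[i];
            }
    }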
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
new file mode 100644
index 000000000..29cf2a308
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
@@ -0,0 +1,733 @@
+/*******************************************************************************
+ Copyright (c) 2012-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_avx
+#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_avx
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_avx
+#define AES_CBC_DEC_192 aes_cbc_dec_192_avx
+#define AES_CBC_DEC_256 aes_cbc_dec_256_avx
+
+#define AES_CNTR_128 aes_cntr_128_avx
+#define AES_CNTR_192 aes_cntr_192_avx
+#define AES_CNTR_256 aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen2
+#define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen2
+#define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen2
+#define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen2
+#define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen2
+#define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen2
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx
+#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_dec_avx
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx
+#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_enc_avx
+#endif
+
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define QUEUE_SIZE queue_size_avx
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX
+#define FLUSH_JOB_AES_DEC FLUSH_JOB_AES_DEC_AVX
+
+
+
+JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+
+#define SUBMIT_JOB_HMAC submit_job_hmac_avx
+#define FLUSH_JOB_HMAC flush_job_hmac_avx
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_avx
+#define FLUSH_JOB flush_job_avx
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx
+#define GET_NEXT_JOB get_next_job_avx
+#define GET_COMPLETED_JOB get_completed_job_avx
+
+/* ====================================================================== */
+
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_avx
+
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX arch
+ */
+#ifndef NO_GCM
+static JOB_AES_HMAC *
+submit_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+flush_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ (void) state;
+ (void) job;
+ return NULL;
+}
+
+static JOB_AES_HMAC *
+submit_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+flush_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ (void) state;
+ (void) job;
+ return NULL;
+}
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_avx(JOB_AES_HMAC *job)
+{
+ if (16 == job->aes_key_len_in_bytes)
+ AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv_len_in_bytes);
+
+ job->status |= STS_COMPLETED_AES;
+ return job;
+}
+
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job)
+{
+ if (16 == job->aes_key_len_in_bytes)
+ aes_cntr_bit_128_avx(job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bits,
+ job->iv_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ aes_cntr_bit_192_avx(job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bits,
+ job->iv_len_in_bytes);
+ else /* assume 32 bytes */
+ aes_cntr_bit_256_avx(job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->iv,
+ job->aes_enc_key_expanded,
+ job->dst,
+ job->msg_len_to_cipher_in_bits,
+ job->iv_len_in_bytes);
+
+ job->status |= STS_COMPLETED_AES;
+ return job;
+}
+
+void
+init_mb_mgr_avx(MB_MGR *state)
+{
+ unsigned int j;
+ uint8_t *p;
+ size_t size;
+
+ state->features = cpu_feature_adjust(state->flags,
+ cpu_feature_detect());
+
+ if (!(state->features & IMB_FEATURE_AESNI)) {
+ init_mb_mgr_sse_no_aesni(state);
+ return;
+ }
+
+ /* Init AES out-of-order fields */
+ memset(state->aes128_ooo.lens, 0xFF,
+ sizeof(state->aes128_ooo.lens));
+ memset(&state->aes128_ooo.lens[0], 0,
+ sizeof(state->aes128_ooo.lens[0]) * 8);
+ memset(state->aes128_ooo.job_in_lane, 0,
+ sizeof(state->aes128_ooo.job_in_lane));
+ state->aes128_ooo.unused_lanes = 0xF76543210;
+ state->aes128_ooo.num_lanes_inuse = 0;
+
+ memset(state->aes192_ooo.lens, 0xFF,
+ sizeof(state->aes192_ooo.lens));
+ memset(&state->aes192_ooo.lens[0], 0,
+ sizeof(state->aes192_ooo.lens[0]) * 8);
+ memset(state->aes192_ooo.job_in_lane, 0,
+ sizeof(state->aes192_ooo.job_in_lane));
+ state->aes192_ooo.unused_lanes = 0xF76543210;
+ state->aes192_ooo.num_lanes_inuse = 0;
+
+ memset(&state->aes256_ooo.lens, 0xFF,
+ sizeof(state->aes256_ooo.lens));
+ memset(&state->aes256_ooo.lens[0], 0,
+ sizeof(state->aes256_ooo.lens[0]) * 8);
+ memset(state->aes256_ooo.job_in_lane, 0,
+ sizeof(state->aes256_ooo.job_in_lane));
+ state->aes256_ooo.unused_lanes = 0xF76543210;
+ state->aes256_ooo.num_lanes_inuse = 0;
+
+ /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+ * uses same settings as AES128 CBC.
+ */
+ memset(state->docsis_sec_ooo.lens, 0xFF,
+ sizeof(state->docsis_sec_ooo.lens));
+ memset(&state->docsis_sec_ooo.lens[0], 0,
+ sizeof(state->docsis_sec_ooo.lens[0]) * 8);
+ memset(state->docsis_sec_ooo.job_in_lane, 0,
+ sizeof(state->docsis_sec_ooo.job_in_lane));
+ state->docsis_sec_ooo.unused_lanes = 0xF76543210;
+ state->docsis_sec_ooo.num_lanes_inuse = 0;
+
+
+ /* Init HMAC/SHA1 out-of-order fields */
+ state->hmac_sha_1_ooo.lens[0] = 0;
+ state->hmac_sha_1_ooo.lens[1] = 0;
+ state->hmac_sha_1_ooo.lens[2] = 0;
+ state->hmac_sha_1_ooo.lens[3] = 0;
+ state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA1_LANES; j++) {
+ state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+ memset(p + 5*4 + 1,
+ 0x00,
+ 64 - 5*4 - 1 - 2);
+ p[5*4] = 0x80;
+ p[64-2] = 0x02;
+ p[64-1] = 0xA0;
+ }
+ /* Init HMAC/SHA224 out-of-order fields */
+ state->hmac_sha_224_ooo.lens[0] = 0;
+ state->hmac_sha_224_ooo.lens[1] = 0;
+ state->hmac_sha_224_ooo.lens[2] = 0;
+ state->hmac_sha_224_ooo.lens[3] = 0;
+ state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+
+ p = state->hmac_sha_224_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[7 * 4] = 0x80; /* digest 7 words long */
+		p[64 - 2] = 0x02; /* length in bits (big endian) = 0x02E0 */
+ p[64 - 1] = 0xE0;
+ }
+
+ /* Init HMAC/SHA256 out-of-order fields */
+ state->hmac_sha_256_ooo.lens[0] = 0;
+ state->hmac_sha_256_ooo.lens[1] = 0;
+ state->hmac_sha_256_ooo.lens[2] = 0;
+ state->hmac_sha_256_ooo.lens[3] = 0;
+ state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+ memset(p + 8*4 + 1,
+ 0x00,
+ 64 - 8*4 - 1 - 2);
+ p[8 * 4] = 0x80; /* 8 digest words */
+ p[64 - 2] = 0x03; /* length */
+ p[64 - 1] = 0x00;
+ }
+
+
+ /* Init HMAC/SHA384 out-of-order fields */
+ state->hmac_sha_384_ooo.lens[0] = 0;
+ state->hmac_sha_384_ooo.lens[1] = 0;
+ state->hmac_sha_384_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_384_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < AVX_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1),
+ 0x00, SHA_384_BLOCK_SIZE + 7);
+
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_384_BLOCK_SIZE -
+ SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80;
+		/* The HMAC outer block always has a fixed length:
+		 * one whole block of key XOR opad (1024 bits) plus the
+		 * inner digest (384 bits) = 1408 bits == 0x0580.
+		 * The input message block needs to be converted to big endian
+		 * within the sha implementation before use.
+		 */
+ p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+ p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+ }
+
+ /* Init HMAC/SHA512 out-of-order fields */
+ state->hmac_sha_512_ooo.lens[0] = 0;
+ state->hmac_sha_512_ooo.lens[1] = 0;
+ state->hmac_sha_512_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_512_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < AVX_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1),
+ 0x00, SHA_512_BLOCK_SIZE + 7);
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_512_BLOCK_SIZE -
+ SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80;
+		/*
+		 * The HMAC outer block always has a fixed length:
+		 * one whole block of key XOR opad (1024 bits) plus the
+		 * inner digest (512 bits) = 1536 bits == 0x600.
+		 * The input message block needs to be converted to big endian
+		 * within the sha implementation before use.
+		 */
+ p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+ p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+ }
+
+
+ /* Init HMAC/MD5 out-of-order fields */
+ state->hmac_md5_ooo.lens[0] = 0;
+ state->hmac_md5_ooo.lens[1] = 0;
+ state->hmac_md5_ooo.lens[2] = 0;
+ state->hmac_md5_ooo.lens[3] = 0;
+ state->hmac_md5_ooo.lens[4] = 0;
+ state->hmac_md5_ooo.lens[5] = 0;
+ state->hmac_md5_ooo.lens[6] = 0;
+ state->hmac_md5_ooo.lens[7] = 0;
+ state->hmac_md5_ooo.lens[8] = 0xFFFF;
+ state->hmac_md5_ooo.lens[9] = 0xFFFF;
+ state->hmac_md5_ooo.lens[10] = 0xFFFF;
+ state->hmac_md5_ooo.lens[11] = 0xFFFF;
+ state->hmac_md5_ooo.lens[12] = 0xFFFF;
+ state->hmac_md5_ooo.lens[13] = 0xFFFF;
+ state->hmac_md5_ooo.lens[14] = 0xFFFF;
+ state->hmac_md5_ooo.lens[15] = 0xFFFF;
+ state->hmac_md5_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < AVX_NUM_MD5_LANES; j++) {
+ state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+
+ p = state->hmac_md5_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_md5_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[4 * 4] = 0x80;
+ p[64 - 7] = 0x02;
+ p[64 - 8] = 0x80;
+ }
+
+ /* Init AES/XCBC OOO fields */
+ state->aes_xcbc_ooo.lens[0] = 0;
+ state->aes_xcbc_ooo.lens[1] = 0;
+ state->aes_xcbc_ooo.lens[2] = 0;
+ state->aes_xcbc_ooo.lens[3] = 0;
+ state->aes_xcbc_ooo.lens[4] = 0;
+ state->aes_xcbc_ooo.lens[5] = 0;
+ state->aes_xcbc_ooo.lens[6] = 0;
+ state->aes_xcbc_ooo.lens[7] = 0;
+ state->aes_xcbc_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < 8; j++) {
+ state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+ state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+ memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15);
+ }
+
+ /* Init AES-CCM auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_ccm_ooo.init_done[j] = 0;
+ state->aes_ccm_ooo.lens[j] = 0;
+ state->aes_ccm_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_ccm_ooo.unused_lanes = 0xF76543210;
+
+ /* Init AES-CMAC auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_cmac_ooo.init_done[j] = 0;
+ state->aes_cmac_ooo.lens[j] = 0;
+ state->aes_cmac_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_cmac_ooo.unused_lanes = 0xF76543210;
+
+ /* Init "in order" components */
+ state->next_job = 0;
+ state->earliest_job = -1;
+
+ /* set AVX handlers */
+ state->get_next_job = get_next_job_avx;
+ state->submit_job = submit_job_avx;
+ state->submit_job_nocheck = submit_job_nocheck_avx;
+ state->get_completed_job = get_completed_job_avx;
+ state->flush_job = flush_job_avx;
+ state->queue_size = queue_size_avx;
+ state->keyexp_128 = aes_keyexp_128_avx;
+ state->keyexp_192 = aes_keyexp_192_avx;
+ state->keyexp_256 = aes_keyexp_256_avx;
+ state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx;
+ state->xcbc_keyexp = aes_xcbc_expand_key_avx;
+ state->des_key_sched = des_key_schedule;
+ state->sha1_one_block = sha1_one_block_avx;
+ state->sha1 = sha1_avx;
+ state->sha224_one_block = sha224_one_block_avx;
+ state->sha224 = sha224_avx;
+ state->sha256_one_block = sha256_one_block_avx;
+ state->sha256 = sha256_avx;
+ state->sha384_one_block = sha384_one_block_avx;
+ state->sha384 = sha384_avx;
+ state->sha512_one_block = sha512_one_block_avx;
+ state->sha512 = sha512_avx;
+ state->md5_one_block = md5_one_block_avx;
+ state->aes128_cfb_one = aes_cfb_128_one_avx;
+
+ state->eea3_1_buffer = zuc_eea3_1_buffer_avx;
+ state->eea3_4_buffer = zuc_eea3_4_buffer_avx;
+ state->eea3_n_buffer = zuc_eea3_n_buffer_avx;
+ state->eia3_1_buffer = zuc_eia3_1_buffer_avx;
+
+ state->f8_1_buffer = kasumi_f8_1_buffer_avx;
+ state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx;
+ state->f8_2_buffer = kasumi_f8_2_buffer_avx;
+ state->f8_3_buffer = kasumi_f8_3_buffer_avx;
+ state->f8_4_buffer = kasumi_f8_4_buffer_avx;
+ state->f8_n_buffer = kasumi_f8_n_buffer_avx;
+ state->f9_1_buffer = kasumi_f9_1_buffer_avx;
+ state->f9_1_buffer_user = kasumi_f9_1_buffer_user_avx;
+ state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx;
+ state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx;
+ state->kasumi_key_sched_size = kasumi_key_sched_size_avx;
+
+ state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx;
+ state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx;
+ state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx;
+ state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx;
+ state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx;
+ state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx;
+ state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx;
+ state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx;
+ state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx;
+ state->snow3g_init_key_sched = snow3g_init_key_sched_avx;
+ state->snow3g_key_sched_size = snow3g_key_sched_size_avx;
+
+#ifndef NO_GCM
+ state->gcm128_enc = aes_gcm_enc_128_avx_gen2;
+ state->gcm192_enc = aes_gcm_enc_192_avx_gen2;
+ state->gcm256_enc = aes_gcm_enc_256_avx_gen2;
+ state->gcm128_dec = aes_gcm_dec_128_avx_gen2;
+ state->gcm192_dec = aes_gcm_dec_192_avx_gen2;
+ state->gcm256_dec = aes_gcm_dec_256_avx_gen2;
+ state->gcm128_init = aes_gcm_init_128_avx_gen2;
+ state->gcm192_init = aes_gcm_init_192_avx_gen2;
+ state->gcm256_init = aes_gcm_init_256_avx_gen2;
+ state->gcm128_enc_update = aes_gcm_enc_128_update_avx_gen2;
+ state->gcm192_enc_update = aes_gcm_enc_192_update_avx_gen2;
+ state->gcm256_enc_update = aes_gcm_enc_256_update_avx_gen2;
+ state->gcm128_dec_update = aes_gcm_dec_128_update_avx_gen2;
+ state->gcm192_dec_update = aes_gcm_dec_192_update_avx_gen2;
+ state->gcm256_dec_update = aes_gcm_dec_256_update_avx_gen2;
+ state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx_gen2;
+ state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx_gen2;
+ state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx_gen2;
+ state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx_gen2;
+ state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_avx_gen2;
+ state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx_gen2;
+ state->gcm128_precomp = aes_gcm_precomp_128_avx_gen2;
+ state->gcm192_precomp = aes_gcm_precomp_192_avx_gen2;
+ state->gcm256_precomp = aes_gcm_precomp_256_avx_gen2;
+ state->gcm128_pre = aes_gcm_pre_128_avx_gen2;
+ state->gcm192_pre = aes_gcm_pre_192_avx_gen2;
+ state->gcm256_pre = aes_gcm_pre_256_avx_gen2;
+#endif
+}
+
+#include "mb_mgr_code.h"
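[Editor's note] For context, a hedged usage sketch of the manager initialised above. The handler pointers (get_next_job/submit_job/flush_job) and the job fields used for AES-CTR appear in this file; the selector fields and enum values (ENCRYPT, CIPHER_HASH, CNTR, NULL_HASH) are assumed from the library's public intel-ipsec-mb.h and may differ between versions, so treat this as an illustration of the submit/flush pattern rather than authoritative API documentation.

    #include "intel-ipsec-mb.h"

    /* Encrypt one buffer with AES-128-CTR through an initialised MB_MGR. */
    static int ctr128_encrypt_one(MB_MGR *mgr, const void *enc_keys,
                                  const void *iv, const void *in, void *out,
                                  uint64_t len)
    {
            JOB_AES_HMAC *job = mgr->get_next_job(mgr);

            job->cipher_direction = ENCRYPT;        /* assumed enum names */
            job->chain_order = CIPHER_HASH;
            job->cipher_mode = CNTR;
            job->hash_alg = NULL_HASH;
            job->aes_enc_key_expanded = enc_keys;
            job->aes_key_len_in_bytes = 16;
            job->iv = iv;
            job->iv_len_in_bytes = 16;
            job->src = in;
            job->cipher_start_src_offset_in_bytes = 0;
            job->dst = out;
            job->msg_len_to_cipher_in_bytes = len;

            job = mgr->submit_job(mgr);             /* may return an earlier queued job */
            while (job == NULL)
                    job = mgr->flush_job(mgr);      /* drain out-of-order lanes */
            return (job->status == STS_COMPLETED) ? 0 : -1;
    }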
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm
new file mode 100644
index 000000000..750a630aa
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm
@@ -0,0 +1,298 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha1_mult_avx
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000
+ dq 0x0000000000000000, 0x0000000000000000
+
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(flush_job_hmac_avx,function,internal)
+flush_job_hmac_avx:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens]
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens], xmm0
+
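+	;; pick the lane with the least data left: run the x4 SHA-1 core for
+	;; that minimum length, then subtract it from every lane's counter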
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_avx
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
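+	;; lane lengths are counted in 64-byte blocks; the outer hash is a
+	;; single block built in outer_block from the inner digest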
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ ;; idx determines which column
+ ;; read off from consecutive rows
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes]
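+	;; unused_lanes is a byte-packed stack of free lane ids; push this lane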
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp2)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp4)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
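The flush routine above pairs with submit_job_hmac_avx (mb_mgr_hmac_submit_avx.asm, later in this patch). The C sketch below shows how a caller could drive that pair for one out-of-order manager; it is illustrative only. The real scheduler lives in mb_mgr_code.h, the prototypes are transcribed from the assembly headers (return type written here as JOB_AES_HMAC *, which the assembly comments abbreviate to JOB *), and the batch-driver function, its parameters, and the header name are assumptions.

#include <stddef.h>
#include "mb_mgr.h"   /* assumed header: declares MB_MGR_HMAC_SHA_1_OOO and JOB_AES_HMAC */

JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job);
JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state);

/* Hypothetical batch driver: submit jobs until the manager starts returning
 * completed ones, then flush the remaining lanes at the end of the batch. */
static void hmac_sha1_batch(MB_MGR_HMAC_SHA_1_OOO *state,
                            JOB_AES_HMAC **jobs, int n_jobs)
{
        JOB_AES_HMAC *done;
        int i;

        for (i = 0; i < n_jobs; i++) {
                /* Returns NULL while lanes are still filling; once all lanes
                 * are busy, each submit runs the multi-buffer core and
                 * returns one completed job. */
                done = submit_job_hmac_avx(state, jobs[i]);
                if (done != NULL) {
                        /* done->auth_tag_output holds the HMAC-SHA1 tag */
                }
        }

        /* Drain: each call completes at most one in-flight job. */
        while ((done = flush_job_hmac_avx(state)) != NULL) {
                /* consume done->auth_tag_output */
        }
}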
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
new file mode 100644
index 000000000..a53ad0843
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
@@ -0,0 +1,321 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000
+ dq 0x0000000000000000, 0x0000000000000000
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define idx rbp
+
+; unused_lanes must be in rax-rdx
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define tmp5 r9
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(flush_job_hmac_md5_avx,function,internal)
+flush_job_hmac_md5_avx:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel three]
+ cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel four]
+ cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel five]
+ cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel six]
+ cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel seven]
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens_md5]
+ mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_md5], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_md5 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
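+	;; args_digest_md5 stores one digest word per row, one lane per column;
+	;; gather this lane's 4 dwords (no byte swap needed for MD5)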
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+ vmovdqa [lane_data + _outer_block], xmm0
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_md5 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_md5]
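+	;; unused_lanes packs free lane ids into 4-bit nibbles; push this lane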
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+; bswap DWORD(tmp2)
+; bswap DWORD(tmp4)
+;	bswap	DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp5)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+	; copy remaining 4 bytes to return 16 byte digest
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp5)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (16 bytes)
+%assign J 0
+%rep 4
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+
+ lea lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
new file mode 100644
index 000000000..5e4627dca
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
@@ -0,0 +1,355 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/memcpy.asm"
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_md5_avx,function,internal)
+submit_job_hmac_md5_avx:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov [state + _unused_lanes_md5], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ ;; insert len into proper lane
+ vmovdqa xmm0, [state + _lens_md5]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
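+	;; extra_blocks = ceil((last_len + 9) / 64): 64-byte blocks still needed
+	;; for the message tail plus the 0x80 pad byte and 8-byte length field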
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ vmovdqu xmm0, [p - 64 + 0*16]
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
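+	;; size_offset: where the 8-byte length field lands in extra_block (last
+	;; qword of the padded tail); start_offset: where the tail data begins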
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
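+	;; total hashed length in bits = 8 * (64-byte ipad block + len);
+	;; MD5 stores the length little-endian, hence no bswap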
+ lea tmp, [8*64 + 8*len]
+; bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens_md5]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xf
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_md5]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens_md5]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+; vpshufb xmm0, [byteswap wrt rip]
+ vmovdqa [lane_data + _outer_block], xmm0
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens_md5]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+	;; less than one message block of data
+	;; copy from the beginning of the source block into extra_block,
+	;; ending where the 0x80 pad byte was pre-populated (backwards by len)
+	;; p2 clobbers unused_lanes, so reload it before exiting
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_md5]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+	; copy remaining 4 bytes to return 16 byte digest
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp3)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0
+
+ vpxor xmm0, xmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
new file mode 100644
index 000000000..416dfb869
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
new file mode 100644
index 000000000..ad0721cd7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
new file mode 100644
index 000000000..0d8b8e50e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
@@ -0,0 +1,356 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_avx
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 3
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ vmovdqa xmm0, [state + _lens_sha256]
+ mov tmp, [state + _args_data_ptr_sha256 + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + 8*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha256], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+
+ ;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp6)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp5)
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp6)
+ mov [p + 3*4], DWORD(tmp5)
+
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+%ifndef SHA224
+ bswap DWORD(tmp5)
+%endif
+ mov [p + 4*4], DWORD(tmp2)
+ mov [p + 5*4], DWORD(tmp4)
+ mov [p + 6*4], DWORD(tmp6)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+	;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
new file mode 100644
index 000000000..738d88b94
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
@@ -0,0 +1,428 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_avx
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*3], rsi
+ mov [rsp + _gpr_save + 8*4], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
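+	;; extra_blocks = ceil((last_len + 9) / 64): 64-byte blocks still needed
+	;; for the message tail plus the 0x80 pad byte and 8-byte length field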
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + 8*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ vmovdqu xmm0, [p - 64 + 0*16]
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
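+	;; total hashed length in bits = 8 * (64-byte ipad block + len),
+	;; stored big-endian (bswap) as SHA-256/224 requires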
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha256]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+	;; less than one message block of data
+	;; copy from the beginning of the source block into extra_block,
+	;; ending where the 0x80 pad byte was pre-populated (backwards by len)
+	;; p2 clobbers unused_lanes, so reload it before exiting
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+ ; copy 14 bytes for SHA224 / 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp4)
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+ mov [p + 3*4], DWORD(tmp4)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor xmm0, xmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*3]
+ mov rdi, [rsp + _gpr_save + 8*4]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
new file mode 100644
index 000000000..f3491ab27
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+
+%include "avx/mb_mgr_hmac_sha_512_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
new file mode 100644
index 000000000..a2fb0f1c6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+
+%include "avx/mb_mgr_hmac_sha_512_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
new file mode 100644
index 000000000..2de170948
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
@@ -0,0 +1,339 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+one: dq 1
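+;; (informational) each len_masks entry, OR-ed into the packed 16-bit length
+;; vector of an idle lane, forces that lane's length to 0xFFFF so the
+;; vphminposuw scheduler below never selects it as the minimum.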
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_avx
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
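+ ;; (informational) unused_lanes is a byte-packed LIFO of free lane indices
+ ;; topped by a 0xFF sentinel; while both lanes are still free the sentinel
+ ;; keeps bit 23 set, so there is no job to flush and NULL is returned.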
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel one]
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens_sha512]
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha512], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0xA0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha512], xmm0
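+ ;; (informational) scheduling sketch of the min-length step above, in
+ ;; C-like terms (lengths are counted in SHA-512 blocks):
+ ;;     min_len = min(lens[0], lens[1]); idx = argmin;
+ ;;     lens[0] -= min_len; lens[1] -= min_len;
+ ;;     sha512_x2_avx(state, min_len);   /* hash min_len blocks in both lanes */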
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+ mov word [state + _lens_sha512 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+ ; move digest into data location
+ %assign I 0
+ %rep (SHA_X_DIGEST_SIZE / (8*16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0
+ %assign I (I+1)
+ %endrep
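+ ;; (informational) this prepares the HMAC outer hash: the byte-swapped
+ ;; inner digest becomes the single-block outer message, and the digest
+ ;; registers are reloaded below with the precomputed hash state of
+ ;; (key XOR opad) before the lane is rescheduled.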
+
+ ; move the opad key into digest
+ mov tmp, [job + _auth_key_xor_opad]
+
+ %assign I 0
+ %rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+ mov [p + 3*8], QWORD(tmp5)
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 4*8], QWORD(tmp2)
+ mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp6)
+ mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)]
+ ;; Clear first 128 bytes of extra_block
+%assign offset 0
+%rep 8
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+ vmovdqa [lane_data + _outer_block + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+ vmovdqa [lane_data + _outer_block + 48], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
new file mode 100644
index 000000000..b37884d0f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
@@ -0,0 +1,416 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_avx
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ mov last_len, len
+ and last_len, 127
+ lea extra_blocks, [last_len + 17 + 127]
+ shr extra_blocks, 7
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
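+ ;; (informational) C-equivalent sketch of the extra-block count above,
+ ;; given SHA-512's 128-byte block, 1-byte 0x80 pad and 16-byte length field:
+ ;;     extra_blocks = (len % 128 + 1 + 16 + 127) / 128;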
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p
+
+ cmp len, 128
+ jb copy_lt128
+
+fast_copy:
+ add p, len
+%assign I 0
+%rep 2
+ vmovdqu xmm0, [p - 128 + I*4*16 + 0*16]
+ vmovdqu xmm1, [p - 128 + I*4*16 + 1*16]
+ vmovdqu xmm2, [p - 128 + I*4*16 + 2*16]
+ vmovdqu xmm3, [p - 128 + I*4*16 + 3*16]
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3
+%assign I (I+1)
+%endrep
+
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
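+ ;; (informational) size_offset locates the final 8 bytes of the last padded
+ ;; block inside extra_block (where the low 64 bits of the length field go);
+ ;; start_offset is where the leftover message bytes begin so that the
+ ;; padded data ends exactly on a 128-byte boundary.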
+
+ lea tmp, [8*128 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp
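+ ;; (informational) the hashed length is the 128-byte ipad block plus the
+ ;; message, expressed in bits, i.e. (128 + len) * 8, byte-swapped to
+ ;; big-endian before being stored at size_offset above.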
+
+ mov tmp, [job + _auth_key_xor_ipad]
+
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 2 * 8]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
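+ ;; (informational) the lane's digest starts from the precomputed hash state
+ ;; of (key XOR ipad) supplied by the job, so only the message blocks remain
+ ;; to be processed for the HMAC inner hash.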
+
+ test len, ~127
+ jnz ge128_bytes
+
+lt128_bytes:
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; point the lane's data pointer at the extra block
+ mov dword [lane_data + _extra_blocks_sha512], 0
+
+ge128_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha512]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0xA0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp start_loop
+
+ align 16
+copy_lt128:
+ ;; less than one message block of data
+ ;; copy into the extra block so the data ends where the pre-populated 0x80
+ ;; pad byte sits (dst = extra_block + 128 - len)
+ lea p2, [lane_data + _extra_block + 128]
+ sub p2, len
+ memcpy_avx_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp)
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor xmm0, xmm0
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ ;; Clear first 128 bytes of extra_block
+%assign offset 0
+%rep 8
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+ vmovdqa [lane_data + _outer_block + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+ vmovdqa [lane_data + _outer_block + 48], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
new file mode 100644
index 000000000..418f0bc43
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
@@ -0,0 +1,358 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha1_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rdi, rsi, rbx, rbp
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_avx,function,internal)
+submit_job_hmac_avx:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
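+ ;; (informational) 9 = 1 (0x80 pad byte) + 8 (64-bit length field); with
+ ;; SHA-1's 64-byte block, extra_blocks = (len % 64 + 9 + 63) / 64.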
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ vmovdqu xmm0, [p - 64 + 0*16]
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; copy from the beginning of the source block into the extra block,
+ ;; ending where the pre-populated 0x80 pad byte sits (dst = extra_block + 64 - len)
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ vpxor xmm0, xmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm
new file mode 100644
index 000000000..1aa2c2600
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm
@@ -0,0 +1,716 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute eight-lane MD5 (two sets of four lanes, "x4x2") using AVX
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+extern MD5_TABLE
+
+section .data
+default rel
+align 64
+ONES:
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+
+section .text
+
+%ifdef LINUX
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define mem1 rcx
+%define mem2 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define mem1 rdi
+%define mem2 rsi
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+%define TBL rax
+%define IDX rbx
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+; stack size must be an odd multiple of 8 bytes in size
+struc STACK
+_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: reso 8 ; stores AA-DD, AA2-DD2
+ resb 8 ; for alignment
+endstruc
+%define STACK_SIZE STACK_size
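+; (informational) an odd multiple of 8 because RSP is 8 mod 16 on entry
+; (the call pushed a return address), so subtracting an odd number of qwords
+; restores the 16-byte alignment needed by the aligned vmovdqa spills below.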
+
+%define AA rsp + _DIGEST + 16*0
+%define BB rsp + _DIGEST + 16*1
+%define CC rsp + _DIGEST + 16*2
+%define DD rsp + _DIGEST + 16*3
+%define AA2 rsp + _DIGEST + 16*4
+%define BB2 rsp + _DIGEST + 16*5
+%define CC2 rsp + _DIGEST + 16*6
+%define DD2 rsp + _DIGEST + 16*7
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z,[rel ONES] ; pnot %%F
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
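+
+; (informational) C equivalents of the four RFC 1321 round functions above:
+;     F(x,y,z) = z ^ (x & (y ^ z))      /* == (x & y) | (~x & z) */
+;     G(x,y,z) = F(z,x,y)
+;     H(x,y,z) = x ^ y ^ z
+;     I(x,y,z) = y ^ (x | ~z)
+; MAGIC_I forms ~z by XOR-ing with the ONES constant since there is no
+; vector NOT instruction.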
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16]
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+;
+align 32
+MKGLOBAL(md5_x4x2_avx,function,internal)
+md5_x4x2_avx:
+
+ sub rsp, STACK_SIZE
+
+ ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
+ ;; Initialize digests
+ vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE]
+ vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
+ vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
+ vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE]
+ vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
+ vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
+ vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ lea TBL, [rel MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[state+_data_ptr_md5 +0*PTR_SZ]
+ mov inp1,[state+_data_ptr_md5 +1*PTR_SZ]
+ mov inp2,[state+_data_ptr_md5 +2*PTR_SZ]
+ mov inp3,[state+_data_ptr_md5 +3*PTR_SZ]
+ mov inp4,[state+_data_ptr_md5 +4*PTR_SZ]
+ mov inp5,[state+_data_ptr_md5 +5*PTR_SZ]
+ mov inp6,[state+_data_ptr_md5 +6*PTR_SZ]
+ mov inp7,[state+_data_ptr_md5 +7*PTR_SZ]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
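+ ;; (informational) mem1/mem2 are ping-pong buffers: while the rounds below
+ ;; consume the transposed block in mem1, the next block is transposed into
+ ;; mem2, and the two pointers are swapped (xchg) at the end of each lloop
+ ;; iteration.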
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16
+ sub num_blks, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
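+        ;; double buffering: the rounds above consumed the block staged in
+        ;; mem1 while the interleaved transposes staged the next block into
+        ;; mem2, so swapping the pointers avoids copying between iterations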
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests
+        vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A
+        vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B
+        vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C
+        vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D
+ vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
+ vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
+ vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
+ vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0
+ mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1
+ mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2
+ mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3
+ mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4
+ mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5
+ mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6
+ mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7
+
+ ;; Clear stack frame (72*16 bytes)
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (2*2*16+8)
+ vmovdqa [rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/pon_avx.asm b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
new file mode 100644
index 000000000..8510dc4a3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
@@ -0,0 +1,1170 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%use smartalign
+
+%include "job_aes_hmac.asm"
+%include "include/os.asm"
+%include "include/memcpy.asm"
+
+;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP.
+;;; This combination is required by the PON/xPON/gPON standard.
+;;; Note: BIP is a running XOR of 32-bit double words.
+;;; Order of operations:
+;;; - encrypt: HEC update (XGEM header), CRC32 (Ethernet FCS), AES-CTR and BIP
+;;; - decrypt: BIP, AES-CTR and CRC32 (Ethernet FCS)
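+;;; For reference, the BIP accumulated here is equivalent to this scalar
+;;; sketch over the BIP-covered bytes (XGEM header plus payload):
+;;;     bip32 = 0
+;;;     for each 32-bit word w: bip32 ^= w
+;;; the vector code keeps four such lanes in an XMM register and folds
+;;; them into one 32-bit value at the end (see "finalize BIP" below).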
+
+extern byteswap_const
+extern ddq_add_1
+
+section .data
+default rel
+
+;;; Precomputed constants for CRC32 (Ethernet FCS)
+;;; CRC algorithm parameters and the expected result for the 4-byte
+;;; buffer {0x01, 0x02, 0x03, 0x04}:
+;;; Result Poly Init RefIn RefOut XorOut
+;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF
+align 16
+rk1:
+ dq 0x00000000ccaa009e, 0x00000001751997d0
+
+align 16
+rk5:
+ dq 0x00000000ccaa009e, 0x0000000163cd6124
+
+align 16
+rk7:
+ dq 0x00000001f7011640, 0x00000001db710640
+
+align 16
+pshufb_shf_table:
+ ;; use these values for shift registers with the pshufb instruction
+ dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+ dq 0x0706050403020100, 0x000e0d0c0b0a0908
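+        ;; loading 16 bytes at [pshufb_shf_table + n] (1 <= n <= 15) gives a
+        ;; vpshufb mask that byte-shifts a register by n positions; mask bytes
+        ;; with bit 7 set (0x8x) select zero, which turns the rotate into the
+        ;; shift used by the partial-block CRC handling below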
+
+align 16
+init_crc_value:
+ dq 0x00000000FFFFFFFF, 0x0000000000000000
+
+align 16
+mask:
+ dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+
+align 16
+mask2:
+ dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+align 16
+mask3:
+ dq 0x8080808080808080, 0x8080808080808080
+
+align 16
+mask_out_top_bytes:
+ dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+ dq 0x0000000000000000, 0x0000000000000000
+
+align 16
+ddq_add_1_1:
+ dq 0x1, 0x1
+
+;; Precomputed constants for HEC calculation (XGEM header)
+;; POLY 0x53900000:
+;; k1 = 0xf9800000
+;; k2 = 0xa0900000
+;; k3 = 0x7cc00000
+;; q = 0x46b927ec
+;; p_res = 0x53900000
+
+align 16
+k3_q:
+ dq 0x7cc00000, 0x46b927ec
+
+align 16
+p_res:
+ dq 0x53900000, 0
+
+align 16
+mask_out_top_64bits:
+ dq 0xffffffff_ffffffff, 0
+
+section .text
+
+%define NUM_AES_ROUNDS 10
+
+%define xcounter xmm0
+%define xbip xmm1
+%define xcrc xmm2
+%define xcrckey xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+%define xtmp4 xmm7
+%define xtmp5 xmm8
+%define xtmp6 xmm9
+%define xtmp7 xmm10
+%define xtmp8 xmm11
+%define xtmp9 xmm12
+%define xtmp10 xmm13
+%define xtmp11 xmm14
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define tmp_1 r8
+%define tmp_2 r9
+%define tmp_3 r10
+%define tmp_4 r11
+%define tmp_5 r12
+%define tmp_6 r13
+%define tmp_7 r14
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define tmp_1 r10
+%define tmp_2 r11
+%define tmp_3 rax
+%define tmp_4 r12
+%define tmp_5 r13
+%define tmp_6 r14
+%define tmp_7 r15
+%endif
+
+%define job arg1
+
+%define p_in arg2
+%define p_keys arg3
+%define p_out arg4
+
+%define num_bytes tmp_1 ; bytes to cipher
+%define tmp tmp_2
+%define ctr_check tmp_3 ; counter block overflow check
+%define bytes_to_crc tmp_4 ; number of bytes to crc ( < num_bytes)
+
+%define ethernet_fcs    tmp_6   ; aliases tmp3 (never used at the same time)
+%define tmp2 tmp_5
+%define tmp3 tmp_6
+
+%define write_back_crc tmp_7
+%define decrypt_not_done tmp_7
+
+;;; ============================================================================
+;;; Does all AES encryption rounds
+%macro AES_ENC_ROUNDS 3
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%N_ROUNDS %2   ; [in] number of AES rounds (10 for 128-bit, 12 for 192-bit, 14 for 256-bit keys)
+%define %%BLOCK %3 ; [in/out] XMM with encrypted block
+
+%assign round 0
+ vpxor %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+
+%rep (%%N_ROUNDS - 1)
+%assign round (round + 1)
+ vaesenc %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+%endrep
+
+%assign round (round + 1)
+ vaesenclast %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+
+%endmacro
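+
+;;; Usage sketch (as invoked from DO_PON below with NUM_AES_ROUNDS = 10):
+;;;     AES_ENC_ROUNDS p_keys, 10, xmm0
+;;; i.e. one whitening XOR, nine vaesenc rounds and a final vaesenclast,
+;;; encrypting xmm0 in place with the AES-128 key schedule at p_keys.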
+
+;;; ============================================================================
+;;; Does all AES encryption rounds on 4 blocks
+%macro AES_ENC_ROUNDS_4 7
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%N_ROUNDS %2   ; [in] number of AES rounds (10 for 128-bit, 12 for 192-bit, 14 for 256-bit keys)
+%define %%BLOCK1 %3 ; [in/out] XMM with encrypted block
+%define %%BLOCK2 %4 ; [in/out] XMM with encrypted block
+%define %%BLOCK3 %5 ; [in/out] XMM with encrypted block
+%define %%BLOCK4 %6 ; [in/out] XMM with encrypted block
+%define %%XT1 %7 ; [clobbered] temporary XMM register
+
+%assign round 0
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vpxor %%BLOCK1, %%BLOCK1, %%XT1
+ vpxor %%BLOCK2, %%BLOCK2, %%XT1
+ vpxor %%BLOCK3, %%BLOCK3, %%XT1
+ vpxor %%BLOCK4, %%BLOCK4, %%XT1
+
+%rep (%%N_ROUNDS - 1)
+%assign round (round + 1)
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vaesenc %%BLOCK1, %%BLOCK1, %%XT1
+ vaesenc %%BLOCK2, %%BLOCK2, %%XT1
+ vaesenc %%BLOCK3, %%BLOCK3, %%XT1
+ vaesenc %%BLOCK4, %%BLOCK4, %%XT1
+%endrep
+
+%assign round (round + 1)
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vaesenclast %%BLOCK1, %%BLOCK1, %%XT1
+ vaesenclast %%BLOCK2, %%BLOCK2, %%XT1
+ vaesenclast %%BLOCK3, %%BLOCK3, %%XT1
+ vaesenclast %%BLOCK4, %%BLOCK4, %%XT1
+%endmacro
+
+;;; ============================================================================
+;;; CRC multiply before XOR against data block
+%macro CRC_CLMUL 3
+%define %%XCRC_IN_OUT %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %2 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%XTMP %3 ; [clobbered] temporary XMM
+
+ vpclmulqdq %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XTMP
+%endmacro
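+
+;;; The two multiplies fold the 128-bit CRC state forward by one 16-byte
+;;; block: new = (state[127:64] x K1) xor (state[63:0] x K2), where K1/K2
+;;; are the low and high quadwords of %%XCRC_MUL (rk1/rk2 above).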
+
+;;; ============================================================================
+;;; PON stitched algorithm round on a single AES block (16 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares counter block
+;;; - encrypts counter block
+;;; - loads text
+;;; - xor's text against encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: via selection of no_crc, no_bip, no_load or no_store, different
+;;; macro behaviour can be achieved to match the needs of the overall
+;;; algorithm.
+%macro DO_PON 15
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store)
+%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load)
+%define %%TXMM2 %11 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %13 ; [in] "ENC" or "DEC"
+%define %%CIPH %14 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vpshufb %%TXMM0, %%CTR, [rel byteswap_const]
+ ;; perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1]
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1_1]
+%%_ctr_overflow_done:
+%endif
+
+ ;; CRC calculation
+%ifidn %%CRC_TYPE, next_crc
+        ;; The CRC_CLMUL macro could be used here, but its final xor hurts
+        ;; performance (it blocks the cipher xor's), so only the CLMULs are
+        ;; done here and the xor is deferred until after the cipher.
+ vpclmulqdq %%TXMM2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+%ifnidn %%INP, no_load
+ vmovdqu %%TXMM1, [%%INP]
+%endif
+
+%ifidn %%CIPH, CTR
+ ;; AES rounds
+ AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%TXMM0, %%TXMM0, %%TXMM1
+%else ;; CIPH = NO_CTR
+ ;; register copy is needed as no_load/no_store options need it
+ vmovdqa %%TXMM0, %%TXMM1
+%endif ;; CIPH = CTR
+
+%ifnidn %%CRC_TYPE, no_crc
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM2
+%endif
+%ifidn %%CIPH, CTR
+ ;; CRC calculation for ENCRYPTION/DECRYPTION
+ ;; - always XOR against plaintext block
+%ifidn %%DIR, ENC
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
+%else
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM0
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; CRC_TYPE != NO_CRC
+
+ ;; store the result in the output buffer
+%ifnidn %%OUTP, no_store
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%TXMM0
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXMM1
+%endif ;; CIPH = CTR
+%endif
+
+ ;; update BIP value - always use cipher text for BIP
+%ifnidn %%XBIP_IN_OUT, no_bip
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM0
+%else
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; !NO_BIP
+
+ ;; increment in/out pointers
+%ifnidn %%INP, no_load
+ add %%INP, 16
+%endif
+%ifnidn %%OUTP, no_store
+ add %%OUTP, 16
+%endif
+%endmacro ; DO_PON
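+
+;;; Typical invocation (see AES128_CTR_PON below):
+;;;     DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+;;;            xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, ENC, CTR, ctr_check
+;;; ciphers one 16-byte block, folds it into the CRC and BIP states and
+;;; advances both buffer pointers by 16 bytes.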
+
+;;; ============================================================================
+;;; PON stitched algorithm round on four AES blocks (4 x 16 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares four counter blocks
+;;; - encrypts the counter blocks
+;;; - loads text
+;;; - xor's text against the encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits per block
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: via selection of no_crc, no_bip, no_load or no_store, different
+;;; macro behaviour can be achieved to match the needs of the overall
+;;; algorithm.
+%macro DO_PON_4 23
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%T0 %9 ; [clobbered] XMM temporary
+%define %%T1 %10 ; [clobbered] XMM temporary
+%define %%T2 %11 ; [clobbered] XMM temporary
+%define %%T3 %12 ; [clobbered] XMM temporary
+%define %%T4 %13 ; [clobbered] XMM temporary
+%define %%T5 %14 ; [clobbered] XMM temporary
+%define %%T6 %15 ; [clobbered] XMM temporary
+%define %%T7 %16 ; [clobbered] XMM temporary
+%define %%T8 %17 ; [clobbered] XMM temporary
+%define %%T9 %18 ; [clobbered] XMM temporary
+%define %%T10 %19 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %20 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %21 ; [in] "ENC" or "DEC"
+%define %%CIPH %22 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %23 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%define %%CTR1 %%T3
+%define %%CTR2 %%T4
+%define %%CTR3 %%T5
+%define %%CTR4 %%T6
+
+%define %%TXT1 %%T7
+%define %%TXT2 %%T8
+%define %%TXT3 %%T9
+%define %%TXT4 %%T10
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vmovdqa %%T0, [rel ddq_add_1]
+ vmovdqa %%T2, [rel byteswap_const]
+
+ ;; CTR1: copy saved CTR value as CTR1
+ vmovdqa %%CTR1, %%CTR
+
+ cmp %%CTR_CHECK, 0xffff_ffff_ffff_ffff - 4
+ ja %%_ctr_will_overflow
+
+ ;; case in which 64-bit counter will not overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ vpaddq %%CTR3, %%CTR2, %%T0
+ vpaddq %%CTR4, %%CTR3, %%T0
+ vpaddq %%CTR, %%CTR4, %%T0
+ vpshufb %%CTR1, %%CTR1, %%T2
+ vpshufb %%CTR2, %%CTR2, %%T2
+ vpshufb %%CTR3, %%CTR3, %%T2
+ vpshufb %%CTR4, %%CTR4, %%T2
+ add %%CTR_CHECK, 4
+ jmp %%_ctr_update_done
+
+%%_ctr_will_overflow:
+ vmovdqa %%T1, [rel ddq_add_1_1]
+ ;; CTR2: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr2_overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ jmp %%_ctr2_overflow_done
+%%_ctr2_overflow:
+ vpaddq %%CTR2, %%CTR1, %%T1
+%%_ctr2_overflow_done:
+ vpshufb %%CTR1, %%CTR1, %%T2
+
+ ;; CTR3: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr3_overflow
+ vpaddq %%CTR3, %%CTR2, %%T0
+ jmp %%_ctr3_overflow_done
+%%_ctr3_overflow:
+ vpaddq %%CTR3, %%CTR2, %%T1
+%%_ctr3_overflow_done:
+ vpshufb %%CTR2, %%CTR2, %%T2
+
+ ;; CTR4: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr4_overflow
+ vpaddq %%CTR4, %%CTR3, %%T0
+ jmp %%_ctr4_overflow_done
+%%_ctr4_overflow:
+ vpaddq %%CTR4, %%CTR3, %%T1
+%%_ctr4_overflow_done:
+ vpshufb %%CTR3, %%CTR3, %%T2
+
+ ;; CTR: perform 1 increment on whole 128 bits (for the next iteration)
+ add %%CTR_CHECK, 1
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR4, %%T0
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR4, %%T1
+%%_ctr_overflow_done:
+ vpshufb %%CTR4, %%CTR4, %%T2
+%%_ctr_update_done:
+%endif
+
+%ifidn %%CRC_TYPE, next_crc
+        ;; The CRC_CLMUL macro could be used here, but its final xor hurts
+        ;; performance (it blocks the cipher xor's), so only the CLMULs are
+        ;; done here and the xor is deferred until after the cipher.
+ vpclmulqdq %%T2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+ ;; load plaintext/ciphertext
+ vmovdqu %%TXT1, [%%INP]
+ vmovdqu %%TXT2, [%%INP + 16]
+ vmovdqu %%TXT3, [%%INP + 32]
+ vmovdqu %%TXT4, [%%INP + 48]
+
+%ifidn %%CIPH, CTR
+ AES_ENC_ROUNDS_4 %%KP, %%N_ROUNDS, %%CTR1, %%CTR2, %%CTR3, %%CTR4, %%T0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%CTR1, %%CTR1, %%TXT1
+ vpxor %%CTR2, %%CTR2, %%TXT2
+ vpxor %%CTR3, %%CTR3, %%TXT3
+ vpxor %%CTR4, %%CTR4, %%TXT4
+%endif ;; CIPH = CTR
+
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%T2
+%endif
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%else
+ ;; CRC calculation for DECRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ;; CIPH = CTR
+
+ ;; store ciphertext/plaintext
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%CTR1
+ vmovdqu [%%OUTP + 16], %%CTR2
+ vmovdqu [%%OUTP + 32], %%CTR3
+ vmovdqu [%%OUTP + 48], %%CTR4
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXT1
+ vmovdqu [%%OUTP + 16], %%TXT2
+ vmovdqu [%%OUTP + 32], %%TXT3
+ vmovdqu [%%OUTP + 48], %%TXT4
+%endif ;; CIPH = CTR
+
+ ;; update BIP value
+%ifidn %%CIPH, CTR
+ ;; - always use ciphertext for BIP
+%ifidn %%DIR, ENC
+ vpxor %%T0, %%CTR1, %%CTR2
+ vpxor %%T1, %%CTR3, %%CTR4
+%else
+ vpxor %%T0, %%TXT1, %%TXT2
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%T0, %%TXT1, %%TXT2
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ;; CIPH = CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T0
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T1
+
+ ;; increment in/out pointers
+ add %%INP, 64
+ add %%OUTP, 64
+
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
+%else
+ ;; CRC calculation for DECRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR4
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
+%endif ;; CIPH = CTR
+
+%endmacro ; DO_PON_4
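+
+;;; Design note: working on four counter blocks per call keeps four
+;;; independent vaesenc dependency chains in flight, hiding the AES round
+;;; latency that the single-block DO_PON macro above cannot.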
+
+;;; ============================================================================
+;;; CIPHER and BIP the specified number of bytes
+%macro CIPHER_BIP_REST 14
+%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher
+%define %%DIR %2 ; [in] "ENC" or "DEC"
+%define %%CIPH %3 ; [in] "CTR" or "NO_CTR"
+%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer
+%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer
+%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys
+%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state
+%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block
+%define %%XMMT1 %9 ; [clobbered] temporary XMM
+%define %%XMMT2 %10 ; [clobbered] temporary XMM
+%define %%XMMT3 %11 ; [clobbered] temporary XMM
+%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow)
+%define %%GPT1 %13 ; [clobbered] temporary GP
+%define %%GPT2 %14 ; [clobbered] temporary GP
+
+ align 16
+%%_cipher_last_blocks:
+ cmp %%NUM_BYTES, 16
+ jb %%_partial_block_left
+
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+ sub %%NUM_BYTES, 16
+ jz %%_bip_done
+ jmp %%_cipher_last_blocks
+
+%%_partial_block_left:
+ simd_load_avx_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES
+
+        ;; DO_PON() neither loads nor stores the data in this case:
+ ;; XMMT2 = data in
+ ;; XMMT1 = data out
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+
+ ;; bip update for partial block (mask out bytes outside the message)
+ lea %%GPT1, [rel mask_out_top_bytes + 16]
+ sub %%GPT1, %%NUM_BYTES
+ vmovdqu %%XMMT3, [%%GPT1]
+ ;; put masked cipher text into XMMT2 for BIP update
+%ifidn %%DIR, ENC
+ vpand %%XMMT2, %%XMMT1, %%XMMT3
+%else
+ vpand %%XMMT2, %%XMMT2, %%XMMT3
+%endif
+ vpxor %%XBIP_IN_OUT, %%XMMT2
+
+ ;; store partial bytes in the output buffer
+ simd_store_avx_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2
+
+%%_bip_done:
+%endmacro ; CIPHER_BIP_REST
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
+
+%macro CRC32_REDUCE_128_TO_32 5
+%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value
+%define %%XCRC %2 ; [in/clobbered] XMM with CRC
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+
+%define %%XCRCKEY %%XT3
+
+ ;; compute crc of a 128-bit value
+ vmovdqa %%XCRCKEY, [rel rk5]
+
+ ;; 64b fold
+ vpclmulqdq %%XT1, %%XCRC, %%XCRCKEY, 0x00
+ vpsrldq %%XCRC, %%XCRC, 8
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+ ;; 32b fold
+ vpslldq %%XT1, %%XCRC, 4
+ vpclmulqdq %%XT1, %%XT1, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+%%_crc_barrett:
+ ;; Barrett reduction
+ vpand %%XCRC, [rel mask2]
+ vmovdqa %%XT1, %%XCRC
+ vmovdqa %%XT2, %%XCRC
+ vmovdqa %%XCRCKEY, [rel rk7]
+
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x00
+ vpxor %%XCRC, %%XT2
+ vpand %%XCRC, [rel mask]
+ vmovdqa %%XT2, %%XCRC
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XT2
+ vpxor %%XCRC, %%XT1
+ vpextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
+ not DWORD(%%CRC)
+%endmacro
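+
+;; e.g. for the 4-byte buffer {0x01, 0x02, 0x03, 0x04} described at the
+;; top of this file, the value left in %%CRC is 0xB63CFBCD.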
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_128_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ ;; 128 to 64 bit reduction
+ vmovdqa %%K3_Q, [k3_q]
+ vmovdqa %%P_RES, [p_res]
+
+ vpclmulqdq %%XTMP, %%XMM_IN_OUT, %%K3_Q, 0x01 ; K3
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x01 ; K3
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+
+ vpand %%XMM_IN_OUT, [rel mask_out_top_64bits]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_64_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ vmovdqa %%K3_Q, [k3_q]
+ vmovdqa %%P_RES, [p_res]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
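+
+;; In outline, with M the input, Q the quotient constant and P the
+;; polynomial: T = (((M >> 32) * Q) xor M) >> 32, result = M xor (T * P),
+;; i.e. a Barrett quotient estimate followed by one multiply-and-xor.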
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+%macro HEC_COMPUTE_32 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1           %3      ; [clobbered] temporary xmm register
+%define %%XT2           %4      ; [clobbered] temporary xmm register
+%define %%XT3           %5      ; [clobbered] temporary xmm register
+%define %%XT4           %6      ; [clobbered] temporary xmm register
+
+ mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr DWORD(%%GT1), 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovd %%XT1, DWORD(%%GT1)
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+ and DWORD(%%HEC_IN_OUT), 1
+ or DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 64-bit XGEM headers
+%macro HEC_COMPUTE_64 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov %%GT1, %%HEC_IN_OUT
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr %%GT1, 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovq %%XT1, %%GT1
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or %%GT1, %%HEC_IN_OUT
+ popcnt %%HEC_IN_OUT, %%GT1
+ and %%HEC_IN_OUT, 1
+ or %%HEC_IN_OUT, %%GT1
+%endmacro
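+
+;; The closing popcnt/and/or sequence computes the parity of the merged
+;; header and 12-bit CRC value and places it in bit 0, the parity bit
+;; reserved by the 13-bit HEC field (hence the (20 - 1) shift above).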
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is the master macro that implements the encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;; defined at the top of the file
+%macro AES128_CTR_PON 2
+%define %%DIR %1 ; [in] direction "ENC" or "DEC"
+%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR"
+
+ push r12
+ push r13
+ push r14
+%ifndef LINUX
+ push r15
+%endif
+
+%ifidn %%DIR, ENC
+ ;; by default write back CRC for encryption
+ mov DWORD(write_back_crc), 1
+%else
+ ;; mark decryption as finished
+ mov DWORD(decrypt_not_done), 1
+%endif
+ ;; START BIP (and update HEC if encrypt direction)
+ ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+ ;; - convert it into LE
+ ;; - update HEC field in the header
+ ;; - convert it into BE
+ ;; - store back the header (with updated HEC)
+ ;; - start BIP
+        ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage)
+ mov tmp_2, [job + _src]
+ add tmp_2, [job + _hash_start_src_offset_in_bytes]
+ mov tmp_3, [tmp_2]
+%ifidn %%DIR, ENC
+ bswap tmp_3 ; go to LE
+ HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
+ mov bytes_to_crc, tmp_3
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+ bswap tmp_3 ; go back to BE
+ mov [tmp_2], tmp_3
+ vmovq xbip, tmp_3
+%else
+ vmovq xbip, tmp_3
+ mov bytes_to_crc, tmp_3
+ bswap bytes_to_crc ; go to LE
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+%endif
+ cmp bytes_to_crc, 4
+ ja %%_crc_not_zero
+ ;; XGEM payload shorter or equal to 4 bytes
+%ifidn %%DIR, ENC
+ ;; On encryption, do not write Ethernet FCS back into the message
+ xor DWORD(write_back_crc), DWORD(write_back_crc)
+%else
+ ;; Mark decryption as not finished
+ ;; - Ethernet FCS is not computed
+ ;; - decrypt + BIP to be done at the end
+ xor DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+%endif
+ mov DWORD(bytes_to_crc), 4 ; it will be zero after the next line (avoid jmp)
+%%_crc_not_zero:
+ sub bytes_to_crc, 4 ; subtract size of the CRC itself
+
+%ifidn %%CIPH, CTR
+ ;; - read 16 bytes of IV
+ ;; - convert to little endian format
+ ;; - save least significant 8 bytes in GP register for overflow check
+ mov tmp, [job + _iv]
+ vmovdqu xcounter, [tmp]
+ vpshufb xcounter, [rel byteswap_const]
+ vmovq ctr_check, xcounter
+%endif
+
+ ;; get input buffer (after XGEM header)
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+
+ ;; get output buffer
+ mov p_out, [job + _dst]
+
+%ifidn %%CIPH, CTR
+ ;; get key pointers
+ mov p_keys, [job + _aes_enc_key_expanded]
+%endif
+
+ ;; initial CRC value
+ vmovdqa xcrc, [rel init_crc_value]
+
+ ;; load CRC constants
+ vmovdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey
+
+ ;; get number of bytes to cipher
+%ifidn %%CIPH, CTR
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+%else
+ ;; Message length to cipher is 0
+ ;; - length is obtained from message length to hash (BIP) minus XGEM header size
+ mov num_bytes, [job + _msg_len_to_hash_in_bytes]
+ sub num_bytes, 8
+%endif
+ or bytes_to_crc, bytes_to_crc
+ jz %%_crc_done
+
+ cmp bytes_to_crc, 32
+ jae %%_at_least_32_bytes
+
+%ifidn %%DIR, DEC
+ ;; decrypt the buffer first
+ mov tmp, num_bytes
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ ;; correct in/out pointers - go back to start of the buffers
+ mov tmp, num_bytes
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+%endif ; DECRYPTION
+
+ ;; less than 32 bytes
+ cmp bytes_to_crc, 16
+ je %%_exact_16_left
+ jl %%_less_than_16_left
+ ;; load the plaintext
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_crc_two_xmms
+
+%%_exact_16_left:
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_128_done
+
+%%_less_than_16_left:
+%ifidn %%DIR, ENC
+ simd_load_avx_15_1 xtmp1, p_in, bytes_to_crc
+%else
+ simd_load_avx_15_1 xtmp1, p_out, bytes_to_crc
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp1, [tmp + bytes_to_crc]
+ vpshufb xcrc, xtmp1
+ jmp %%_128_done
+
+%%_at_least_32_bytes:
+ cmp bytes_to_crc, 64
+ jb %%_crc_below_64_bytes
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+
+ align 16
+%%_main_loop_64:
+ cmp bytes_to_crc, 64
+ jb %%_main_loop
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop_64
+
+%%_crc_below_64_bytes:
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+
+ align 16
+%%_main_loop:
+ cmp bytes_to_crc, 16
+ jb %%_exit_loop
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop
+
+%%_exit_loop:
+
+%ifidn %%DIR, DEC
+ ;; decrypt rest of the message including CRC and optional padding
+ mov tmp, num_bytes
+
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+
+ or bytes_to_crc, bytes_to_crc
+ jz %%_128_done
+%endif ; DECRYPTION
+
+ ;; Partial bytes left - complete CRC calculation
+%%_crc_two_xmms:
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp2, [tmp + bytes_to_crc]
+        ;; @note: for in-place operation (the default) this load creates a
+        ;; store-to-load forwarding problem.
+        ;; However, there is no easy way to address it at the moment.
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%else
+ vmovdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%endif
+ vmovdqa xtmp3, xcrc
+ vpshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc
+ vpxor xtmp2, [rel mask3]
+ vpshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
+
+ ;; data bytes_to_crc (top) blended with MSB bytes of CRC (bottom)
+ vpblendvb xtmp3, xtmp1, xtmp2
+
+ ;; final CRC calculation
+ vpclmulqdq xtmp1, xcrc, xcrckey, 0x01
+ vpclmulqdq xcrc, xcrc, xcrckey, 0x10
+ vpxor xcrc, xtmp3
+ vpxor xcrc, xtmp1
+
+%%_128_done:
+ CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey
+
+%%_crc_done:
+ ;; @todo - store-to-load problem in ENC case (to be fixed later)
+ ;; - store CRC in input buffer and authentication tag output
+ ;; - encrypt remaining bytes
+%ifidn %%DIR, ENC
+ or DWORD(write_back_crc), DWORD(write_back_crc)
+ jz %%_skip_crc_write_back
+ mov [p_in + bytes_to_crc], DWORD(ethernet_fcs)
+%%_skip_crc_write_back:
+%endif
+ mov tmp, [job + _auth_tag_output]
+ mov [tmp + 4], DWORD(ethernet_fcs)
+
+ or num_bytes, num_bytes
+ jz %%_do_not_cipher_the_rest
+
+ ;; encrypt rest of the message
+ ;; - partial bytes including CRC and optional padding
+ ;; decrypt rest of the message
+ ;; - this may only happen when XGEM payload is short and padding is added
+%ifidn %%DIR, DEC
+ or DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+ jnz %%_do_not_cipher_the_rest
+%endif
+ CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+%%_do_not_cipher_the_rest:
+
+ ;; finalize BIP
+ vpsrldq xtmp1, xbip, 4
+ vpsrldq xtmp2, xbip, 8
+ vpsrldq xtmp3, xbip, 12
+ vpxor xtmp1, xtmp1, xtmp2
+ vpxor xbip, xbip, xtmp3
+ vpxor xbip, xbip, xtmp1
+ vmovd [tmp], xbip ; tmp already holds _auth_tag_output
+
+ ;; set job status
+ or dword [job + _status], STS_COMPLETED
+
+ ;; return job
+ mov rax, job
+
+%ifndef LINUX
+ pop r15
+%endif
+ pop r14
+ pop r13
+ pop r12
+%endmacro ; AES128_CTR_PON
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; submit_job_pon_enc_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_enc_avx,function,internal)
+submit_job_pon_enc_avx:
+ AES128_CTR_PON ENC, CTR
+ ret
+
+;;; submit_job_pon_dec_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_dec_avx,function,internal)
+submit_job_pon_dec_avx:
+ AES128_CTR_PON DEC, CTR
+ ret
+
+;;; submit_job_pon_enc_no_ctr_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_enc_no_ctr_avx,function,internal)
+submit_job_pon_enc_no_ctr_avx:
+ AES128_CTR_PON ENC, NO_CTR
+ ret
+
+;;; submit_job_pon_dec_no_ctr_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_dec_no_ctr_avx,function,internal)
+submit_job_pon_dec_no_ctr_avx:
+ AES128_CTR_PON DEC, NO_CTR
+ ret
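+
+;;; Illustrative C-side call (these internal entry points are normally
+;;; dispatched through the multi-buffer manager):
+;;;     JOB_AES_HMAC *job = ...;
+;;;     job = submit_job_pon_enc_avx(job);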
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm
new file mode 100644
index 000000000..b850a227b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm
@@ -0,0 +1,434 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+section .data
+default rel
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+;; rbx, rsi, rdi, rbp, r12-r15 left intact
+;; This version is not safe to call from C/C++
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
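+
+;; Each step evaluates, independently in all four lanes:
+;;   E += ROL(A,5) + MAGIC(B,C,D) + K + W[t] ; B = ROL(B,30)
+;; with ROTATE_ARGS renaming A..E between consecutive steps.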
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
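+
+;; For steps 16..79 the schedule entry is built in place on the stack:
+;;   W[t] = ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
+;; as specified in FIPS 180-1, with the 16-entry window indexed modulo 16.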
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ must be an odd multiple of 8
+%define FRAMESZ 16*16 + 8
+
+%define VMOVPS vmovdqu
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+align 32
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks);
+; arg 1 : rcx : pointer to args
+; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
+MKGLOBAL(sha1_mult_avx,function,internal)
+sha1_mult_avx:
+
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
+ mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [rel K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [rel K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [rel K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+        ;; Clear the stack area containing part of the message
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep 16
+ vmovdqa [rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, FRAMESZ
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
new file mode 100644
index 000000000..090285e54
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
@@ -0,0 +1,501 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses AVX instructions
+%include "include/os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP rdi ; 1st arg
+%define CTX rsi ; 2nd arg
+%define REG3 edx
+%define REG4 ecx
+%else
+%define INP rcx ; 1st arg
+%define CTX rdx ; 2nd arg
+%define REG3 edi
+%define REG4 esi
+%endif
+
+%define FRAMESZ 3*16 + 1*8
+%define _RSP FRAMESZ-1*8 + rsp
+
+%define a eax
+%define b ebx
+%define c REG3
+%define d REG4
+%define e r8d
+%define T1 r9d
+%define f r10d
+%define RND r11d
+%define g r12d
+%define h r13d
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK xmm2
+
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regC
+ xor %%regF,%%regD
+ and %%regF,%%regB
+ xor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regD
+ xor %%regF,%%regC
+ xor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regB
+ mov %%regT,%%regB
+ or %%regF,%%regC
+ and %%regT,%%regC
+ and %%regF,%%regD
+ or %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
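For cross-reference, a minimal C sketch of the four FIPS 180-1 round functions these macros compute (function names here are illustrative, not part of the library). MAGIC_F0 uses the D ^ (B & (C ^ D)) rewrite of the textbook (B & C) | (~B & D) to save an operation, and MAGIC_F3 is the same parity function as MAGIC_F1:

    #include <stdint.h>

    /* Hypothetical reference helpers mirroring MAGIC_F0..MAGIC_F3. */
    static inline uint32_t sha1_f0(uint32_t b, uint32_t c, uint32_t d)
    {
        return d ^ (b & (c ^ d));               /* rounds 0-19  */
    }
    static inline uint32_t sha1_f1(uint32_t b, uint32_t c, uint32_t d)
    {
        return b ^ c ^ d;                       /* rounds 20-39 */
    }
    static inline uint32_t sha1_f2(uint32_t b, uint32_t c, uint32_t d)
    {
        return (b & c) | (b & d) | (c & d);     /* rounds 40-59 */
    }
    static inline uint32_t sha1_f3(uint32_t b, uint32_t c, uint32_t d)
    {
        return b ^ c ^ d;                       /* rounds 60-79 */
    }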
+
+;; input is T1
+%macro ROUND 1
+%define %%MAGIC %1
+ add e,T1
+ mov T1,a
+ rol T1,5
+ add e,T1
+ %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+%endmacro
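As a plain-C cross-check of what one ROUND invocation does (a sketch only; rotl32 and sha1_step are assumed helper names, and f stands for whichever MAGIC_Fi applies to the round range): although the asm rotates through eight register names, the net per-step effect is the standard five-variable SHA-1 update.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* One SHA-1 step; t1 holds W[i] + K[i] on entry (the value pulled out
     * of XFER with vpextrd), f is the round function for this range. */
    static void sha1_step(uint32_t s[5], uint32_t t1,
                          uint32_t (*f)(uint32_t, uint32_t, uint32_t))
    {
        uint32_t e_new = s[4] + t1 + rotl32(s[0], 5) + f(s[1], s[2], s[3]);

        s[4] = s[3];             /* e <- d            */
        s[3] = s[2];             /* d <- c            */
        s[2] = rotl32(s[1], 30); /* c <- rotl(b, 30)  */
        s[1] = s[0];             /* b <- a            */
        s[0] = e_new;            /* a <- new value    */
    }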
+
+%macro do_4i 1
+ vpaddd XFER, XK, X0
+ vpextrd T1, XFER, 0
+ ;ROUND %1
+ add e,T1
+ ;SCHEDULE_4
+ vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14]
+ mov T1,a
+ rol T1,5
+ vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16]
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+
+ ;; Finish low half
+ rol b,30
+ vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA}
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 1
+ ;ROUND %1
+ add e,T1
+ vpxor X4, X4, XTMP0
+ mov T1,a
+ rol T1,5
+ ;; rotate X4 left 1
+ vpsrld XTMP1, X4, (32-1)
+ add e,T1
+ vpslld X4, X4, 1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA}
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 2
+ ;ROUND %1
+ add e,T1
+ mov T1,a
+
+ ;; Finish high half
+ vpalignr XTMP1, X4, X3, 4 ; XTMP1 = w[-3] {DCxx}
+ rol T1,5
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; rotate XTMP0 left 1
+ vpsrld XTMP1, XTMP0, (32-1)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 3
+ ;ROUND %1
+ add e,T1
+ mov T1,a
+ vpslld XTMP0, XTMP0, 1
+ rol T1,5
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx}
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; COMBINE HALVES
+ vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
+ rol b,30
+ add h,e
+
+ rotate_Xs
+ROTATE_ARGS
+%endmacro
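do_4i interleaves four rounds with the message expansion, producing four new schedule words per call (handled in two xmm halves because W[-3] straddles a register boundary). The scalar recurrence it implements is the usual SHA-1 expansion; a minimal C sketch for comparison:

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* Scalar form of the schedule that do_4i computes four words at a time. */
    static void sha1_expand(uint32_t w[80])
    {
        for (int i = 16; i < 80; i++)
            w[i] = rotl32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
    }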
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_block_avx(void *input_data, UINT32 digest[5])
+;; arg 1 : (in) pointer to input data
+;; arg 2 : (in/out) pointer to read/write digest
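A hedged usage sketch (the symbol is built as internal to the library, so calling it directly is for illustration only; the caller is responsible for supplying an already padded 64-byte block and a digest seeded with the SHA-1 initial values):

    #include <stdint.h>

    void sha1_block_avx(void *input_data, uint32_t digest[5]);  /* as documented above */

    static void sha1_single_block(const uint8_t block[64], uint32_t digest[5])
    {
        /* SHA-1 initial hash value from FIPS 180-1. */
        digest[0] = 0x67452301;
        digest[1] = 0xefcdab89;
        digest[2] = 0x98badcfe;
        digest[3] = 0x10325476;
        digest[4] = 0xc3d2e1f0;

        sha1_block_avx((void *)block, digest);  /* block must already be padded */
    }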
+MKGLOBAL(sha1_block_avx,function,internal)
+align 32
+sha1_block_avx:
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+
+ vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+%ifndef LINUX
+ mov rax,rsp ; copy rsp
+ sub rsp,FRAMESZ
+ and rsp,-16 ; align stack frame
+ mov [_RSP],rax ; save copy of rsp
+ vmovdqa [rsp + 0 * 16], xmm6
+ vmovdqa [rsp + 1 * 16], xmm7
+ vmovdqa [rsp + 2 * 16], xmm8
+%endif
+
+ VMOVDQ X0, [INP + 0*16]
+ VMOVDQ X1, [INP + 1*16]
+
+ ;; load next message block
+ VMOVDQ X2, [INP + 2*16]
+ VMOVDQ X3, [INP + 3*16]
+
+ ;; set up a-f based on h0-h4
+ ;; byte swap first 16 dwords
+ mov a, [SZ*0 + CTX]
+ vpshufb X0, XTMP0
+ mov b, [SZ*1 + CTX]
+ vpshufb X1, XTMP0
+ mov c, [SZ*2 + CTX]
+ vpshufb X2, XTMP0
+ mov d, [SZ*3 + CTX]
+ vpshufb X3, XTMP0
+ mov e, [SZ*4 + CTX]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K00_19]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop1_5
+align 16
+loop1:
+
+ do_4i MAGIC_F0
+
+loop1_5:
+ do_4i MAGIC_F0
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K20_39]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop2_5
+align 16
+loop2:
+
+ do_4i MAGIC_F1
+
+loop2_5:
+ do_4i MAGIC_F1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K40_59]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop3_5
+align 16
+loop3:
+
+ do_4i MAGIC_F2
+
+loop3_5:
+ do_4i MAGIC_F2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop3
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 60-79
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K60_79]
+
+ do_4i MAGIC_F3
+
+ vpaddd XFER, XK, X0
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X1
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X2
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X3
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ ;; update result digest h0-h4
+ add [SZ*0 + CTX], a
+ add [SZ*1 + CTX], b
+ add [SZ*2 + CTX], c
+ add [SZ*3 + CTX], d
+ add [SZ*4 + CTX], e
+
+%ifndef LINUX
+ vmovdqa xmm8, [rsp + 2 * 16]
+ vmovdqa xmm7, [rsp + 1 * 16]
+ vmovdqa xmm6, [rsp + 0 * 16]
+
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + 0 * 16], xmm0
+ vmovdqa [rsp + 1 * 16], xmm0
+ vmovdqa [rsp + 2 * 16], xmm0
+%endif
+
+ mov rsp,[_RSP]
+%endif ;; LINUX
+
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm
new file mode 100644
index 000000000..57d997dd3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNC sha224_block_avx
+
+%include "avx/sha256_one_block_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm
new file mode 100644
index 000000000..9c96f036b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm
@@ -0,0 +1,553 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+section .data
+default rel
+align 64
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+ dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00
+_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+ dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
+
+section .text
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro MY_ROR 2
+ shld %1,%1,(32-(%2))
+%endm
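MY_ROR builds a right-rotate out of shld: shifting a register against itself by 32-n leaves the same bit pattern as rotating right by n. In C terms (helper name assumed):

    #include <stdint.h>

    /* shld reg,reg,(32-n) == rotate right by n. */
    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }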
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ VMOVDQ %1, %2
+ vpshufb %1, %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER xmm9
+%define XTMP5 xmm11
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm13
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c ecx
+%define d r8d
+%define e edx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c edi
+%define d esi
+%define e r8d
+
+%endif
+%define TBL rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 7
+%endif
+_XFER: reso 1
+endstruc
+
+%ifndef FUNC
+%define FUNC sha256_block_avx
+%endif
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ ;vmovdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ ;vmovdqa XTMP1, X1
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+ xor y2, g ; y2 = f^g
+ vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpsrld XTMP2, XTMP1, 7
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+
+ vpslld XTMP3, XTMP1, (32-7)
+
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+
+ vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
+
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+
+
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+
+ vpsrld XTMP2, XTMP1,18
+
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+
+ vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+
+ vpslld XTMP1, XTMP1, (32-18)
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP3, XTMP3, XTMP1
+
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+
+ vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+
+ ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpxor XTMP2, XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
+
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP2, XTMP2, XTMP3
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
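The vector work threaded through FOUR_ROUNDS_AND_SCHED is the SHA-256 message expansion, four words per invocation. A scalar C sketch of the same recurrence, for reference:

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* s0/s1 correspond to the "compute s0" / "compute s1" phases above. */
    static void sha256_expand(uint32_t w[64])
    {
        for (int i = 16; i < 64; i++) {
            uint32_t x15 = w[i - 15], x2 = w[i - 2];
            uint32_t s0 = rotr32(x15, 7) ^ rotr32(x15, 18) ^ (x15 >> 3);
            uint32_t s1 = rotr32(x2, 17) ^ rotr32(x2, 19) ^ (x2 >> 10);

            w[i] = w[i - 16] + s0 + w[i - 7] + s1;
        }
    }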
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13))
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endm
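DO_ROUND is one SHA-256 compression round over the scalar registers; note it forms MAJ as ((a|c)&b)|(a&c), which is equivalent to the textbook (a&b)^(a&c)^(b&c). A compact C equivalent, as a sketch only (kw stands for the K[i]+W[i] value staged in _XFER):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* One SHA-256 round; s = {a,b,c,d,e,f,g,h}, kw = K[i] + W[i]. */
    static void sha256_round(uint32_t s[8], uint32_t kw)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
        uint32_t ch  = ((f ^ g) & e) ^ g;         /* as built in y2 above */
        uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
        uint32_t maj = ((a | c) & b) | (a & c);   /* as built in y0 above */
        uint32_t t1  = h + S1 + ch + kw;

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
    }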
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT32 digest[8])
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ ;; load initial digest
+ mov a, [4*0 + CTX]
+ mov b, [4*1 + CTX]
+ mov c, [4*2 + CTX]
+ mov d, [4*3 + CTX]
+ mov e, [4*4 + CTX]
+ mov f, [4*5 + CTX]
+ mov g, [4*6 + CTX]
+ mov h, [4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa SHUF_00BA, [rel _SHUF_00BA]
+ vmovdqa SHUF_DC00, [rel _SHUF_DC00]
+
+ lea TBL,[rel K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+align 16
+loop1:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+loop2:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd XFER, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X0, X2
+ vmovdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ add [4*0 + CTX], a
+ add [4*1 + CTX], b
+ add [4*2 + CTX], c
+ add [4*3 + CTX], d
+ add [4*4 + CTX], e
+ add [4*5 + CTX], f
+ add [4*6 + CTX], g
+ add [4*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 5 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 6 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0
+%endif
+%endif ;; LINUX
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm
new file mode 100644
index 000000000..dddc5df28
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNC sha384_block_avx
+
+%include "avx/sha512_one_block_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm
new file mode 100644
index 000000000..040518e76
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm
@@ -0,0 +1,473 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%ifndef FUNC
+%define FUNC sha512_block_avx
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro MY_ROR 2
+shld %1,%1,(64-(%2))
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each qword
+%macro COPY_XMM_AND_BSWAP 3
+ VMOVDQ %1, %2
+ vpshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+%define X4 xmm8
+%define X5 xmm9
+%define X6 xmm10
+%define X7 xmm11
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XFER xmm13
+
+%define BYTE_FLIP_MASK xmm12
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c rcx
+%define d r8
+%define e rdx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c rdi
+%define d rsi
+%define e r8
+
+%endif
+%define TBL rbp
+%define a rax
+%define b rbx
+
+%define f r9
+%define g r10
+%define h r11
+
+%define y0 r13
+%define y1 r14
+%define y2 r15
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 8
+%endif
+_XFER: reso 1
+endstruc
+
+
+; rotate_Xs
+; Rotate values of symbols X0...X7
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X5
+%xdefine X5 X6
+%xdefine X6 X7
+%xdefine X7 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro TWO_ROUNDS_AND_SCHED 0
+
+ vpalignr XTMP0, X5, X4, 8 ; XTMP0 = W[-7]
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ vpaddq XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ ;; compute s0
+ vpalignr XTMP1, X1, X0, 8 ; XTMP1 = W[-15]
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ vpsllq XTMP2, XTMP1, (64-1)
+ xor y2, g ; y2 = f^g
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ vpsrlq XTMP3, XTMP1, 1
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ and y2, e ; y2 = (f^g)&e
+	MY_ROR	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ vpor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-15] ror 1
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ vpsrlq XTMP3, XTMP1, 8
+ add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ mov y0, a ; y0 = a
+ vpsllq X0, XTMP1, (64-8)
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ vpor X0, X0, XTMP3
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ vpsrlq XTMP1, XTMP1, 7 ; X0 = W[-15] >> 7
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8
+ add h, y0 ; h = t1 + S0 + MAJ
+ vpxor XTMP1, XTMP1, X0 ; XTMP1 = s0
+
+
+ROTATE_ARGS
+ ;; compute s1
+ vpaddq XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ vpsllq XTMP3, X7, (64-19)
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ vpsrlq X0, X7, 19
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ vpor XTMP3, XTMP3, X0 ; XTMP3 = W[-2] ror 19
+ xor y2, g ; y2 = f^g
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ vpsllq XTMP2, X7, (64-61)
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ and y2, e ; y2 = (f^g)&e
+	MY_ROR	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ vpsrlq XTMP1, X7, 61
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ vpor XTMP2, XTMP2, XTMP1 ; XTMP2 = W[-2] ror 61
+ add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ mov y0, a ; y0 = a
+ vpsrlq XTMP1, X7, 6 ; XTMP1 = W[-2] >> 6
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ vpxor XTMP1, XTMP1, XTMP2
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ vpxor X0, XTMP3, XTMP1 ; X0 = s1
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ vpaddq X0, X0, XTMP0 ; X0 = {W[1], W[0]}
+
+ROTATE_ARGS
+rotate_Xs
+%endm
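TWO_ROUNDS_AND_SCHED folds the SHA-512 message expansion (two qwords per call) into the round computation. The recurrence it implements, as a scalar C sketch:

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* Scalar form of the schedule computed two words at a time above. */
    static void sha512_expand(uint64_t w[80])
    {
        for (int i = 16; i < 80; i++) {
            uint64_t x15 = w[i - 15], x2 = w[i - 2];
            uint64_t s0 = rotr64(x15, 1) ^ rotr64(x15, 8) ^ (x15 >> 7);
            uint64_t s1 = rotr64(x2, 19) ^ rotr64(x2, 61) ^ (x2 >> 6);

            w[i] = w[i - 16] + s0 + w[i - 7] + s1;
        }
    }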
+
+;; input is [rsp + _XFER + %1 * 8]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ mov y2, f ; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ xor y2, g ; y2 = f^g
+	xor	y0, e		; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+	MY_ROR	y0, 14		; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ ROTATE_ARGS
+%endm
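The MY_ROR sequences in DO_ROUND assemble the two SHA-512 big-sigma terms piecewise; written directly in C (helper names assumed) they are:

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    static inline uint64_t sha512_S1(uint64_t e)   /* built up in y0 above */
    {
        return rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
    }

    static inline uint64_t sha512_S0(uint64_t a)   /* built up in y1 above */
    {
        return rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
    }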
+
+section .data
+default rel
+align 64
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT64 digest[8])
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ ;; load initial digest
+ mov a, [8*0 + CTX]
+ mov b, [8*1 + CTX]
+ mov c, [8*2 + CTX]
+ mov d, [8*3 + CTX]
+ mov e, [8*4 + CTX]
+ mov f, [8*5 + CTX]
+ mov g, [8*6 + CTX]
+ mov h, [8*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ lea TBL,[rel K512]
+
+ ;; byte swap first 16 qwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK
+
+ ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds
+ mov SRND, 4
+align 16
+loop1:
+
+%assign i 0
+%rep 7
+ vpaddq XFER, X0, [TBL + i*16]
+ vmovdqa [rsp + _XFER], XFER
+ TWO_ROUNDS_AND_SCHED
+%assign i (i+1)
+%endrep
+
+ vpaddq XFER, X0, [TBL + 7*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 8*16
+ TWO_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+ jmp loop2a
+loop2:
+ vmovdqa X0, X4
+ vmovdqa X1, X5
+ vmovdqa X2, X6
+ vmovdqa X3, X7
+
+loop2a:
+ vpaddq X0, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X1, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], X1
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X2, X2, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], X2
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X3, X3, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], X3
+ add TBL, 4*16
+ DO_ROUND 0
+ DO_ROUND 1
+
+ sub SRND, 1
+ jne loop2
+
+ add [8*0 + CTX], a
+ add [8*1 + CTX], b
+ add [8*2 + CTX], c
+ add [8*3 + CTX], d
+ add [8*4 + CTX], e
+ add [8*5 + CTX], f
+ add [8*6 + CTX], g
+ add [8*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 5 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 6 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0
+%endif
+%endif ;; LINUX
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm
new file mode 100644
index 000000000..d7d712e2c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm
@@ -0,0 +1,381 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+extern K512_2
+
+section .data
+default rel
+
+align 32
+; one from sha512_rorx
+; this does the big endian to little endian conversion
+; over a quad word
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ ;ddq 0x18191a1b1c1d1e1f1011121314151617
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else ; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define VMOVPD vmovupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1
+ vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0
+%endm
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
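AVX at this level has no packed 64-bit rotate instruction (that only arrives with AVX-512), so PRORQ synthesizes it from two shifts and an OR. An intrinsics sketch of the same idea, assuming a compile-time shift amount:

    #include <immintrin.h>

    /* Packed rotate-right of two 64-bit lanes, as PRORQ does it. */
    static inline __m128i prorq(__m128i x, const int imm)
    {
        return _mm_or_si128(_mm_srli_epi64(x, imm),
                            _mm_slli_epi64(x, 64 - imm));
    }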
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+
+;; SHA512_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[2];
+;;
+
+;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks)
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
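A hypothetical C-side mirror of the argument layout described above, for orientation only; the authoritative definition of SHA512_ARGS lives in mb_mgr_datastruct.asm, so the field sizes and alignment here are assumptions:

    #include <stdint.h>

    /* Assumed mirror of the layout documented above: two lanes, transposed.
     * The digest rows are loaded with vmovdqa, so in practice the digest
     * area must be at least 16-byte aligned. */
    typedef struct {
        uint64_t digest[8][2];   /* digest[word][lane], i.e. UINT128 digest[8] */
        uint8_t *data_ptr[2];    /* one message pointer per lane               */
    } sha512_x2_args_sketch;

    void sha512_x2_avx(void *args, uint64_t msg_size_in_blocks);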
+MKGLOBAL(sha512_x2_avx,function,internal)
+align 32
+sha512_x2_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel K512_2]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+	;; load up the shuffler for big-endian to little-endian format
+ vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame ((16 + 8)*16 bytes)
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA512_DIGEST_WORDS)
+ vmovdqa [rsp + i*SZ2], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm
new file mode 100644
index 000000000..c1895a3f5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm
@@ -0,0 +1,391 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute quad SHA256 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves:	      rcx	  rsi rdi rbp		  r13 r14 r15
+;;
+;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+extern K256_4
+
+%ifdef LINUX
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ4
+
+; Define stack usage
+struc STACK
+_DATA: resb SZ4 * 16
+_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define VMOVPS vmovups
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
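
The vshufps sequence above is a 4x4 transpose of 32-bit lanes; note that the transposed rows end up in {t0, r1, r0, r3} rather than in place. A scalar C sketch of the same data movement (illustrative only, not part of the library):

#include <stdint.h>

/* Illustrative 4x4 transpose of 32-bit lanes, the scalar equivalent of
 * the TRANSPOSE macro; r[i][j] holds lane j of input register i. */
static void transpose_4x4(uint32_t r[4][4])
{
        uint32_t t[4][4];
        int i, j;

        for (i = 0; i < 4; i++)
                for (j = 0; j < 4; j++)
                        t[j][i] = r[i][j];      /* row 0 becomes {d0 c0 b0 a0}, etc. */

        for (i = 0; i < 4; i++)
                for (j = 0; j < 4; j++)
                        r[i][j] = t[i][j];
}
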
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ ;vmovdqa %%tmp, %%reg
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
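
PRORD/PRORD_nd emulate a per-lane 32-bit rotate right (AVX has no vector rotate instruction) with a left shift by (32 - imm), a right shift by imm and an OR. Scalar equivalent (illustrative sketch):

#include <stdint.h>

/* Scalar equivalent of PRORD: rotate a 32-bit word right by imm bits,
 * assuming 0 < imm < 32. */
static inline uint32_t prord(uint32_t x, unsigned imm)
{
        return (x >> imm) | (x << (32 - imm));
}
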
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
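
ROUND_00_15 is one SHA-256 compression round applied to four lanes at once; the ">>" in the comments above denotes rotate right. A single-lane scalar reference of the same round (illustrative sketch, not the library API):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned r)
{
        return (x >> r) | (x << (32 - r));
}

/* One SHA-256 round on working variables s[0..7] = {a..h}; Wk is the
 * message word with the round constant already added (W + K), as done
 * by the vpaddd against [TBL + ROUND] above. */
static void sha256_round(uint32_t s[8], uint32_t Wk)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t sig1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
        uint32_t ch   = (e & f) ^ (~e & g);
        uint32_t sig0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
        uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);
        uint32_t t1   = h + sig1 + ch + Wk;

        /* ROTATE_ARGS: shift the working variables by one position */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sig0 + maj;
}
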
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
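
ROUND_16_XX extends the message schedule held in the 16-entry circular buffer at [rsp + _DATA] and then reuses ROUND_00_15. Scalar reference of the schedule step (illustrative sketch):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned r)
{
        return (x >> r) | (x << (32 - r));
}

/* W[i] for i >= 16, using the same 16-entry circular indexing ("& 0xf")
 * as the macro. */
static uint32_t sha256_schedule(const uint32_t W[16], unsigned i)
{
        uint32_t w15 = W[(i - 15) & 0xf];
        uint32_t w2  = W[(i - 2) & 0xf];
        uint32_t s0  = rotr32(w15, 7) ^ rotr32(w15, 18) ^ (w15 >> 3);
        uint32_t s1  = rotr32(w2, 17) ^ rotr32(w2, 19) ^ (w2 >> 10);

        return s1 + W[(i - 7) & 0xf] + s0 + W[(i - 16) & 0xf];
}
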
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; SHA256_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[4];
+;;
+
+;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+MKGLOBAL(sha_256_mult_avx,function,internal)
+align 16
+sha_256_mult_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE+0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE+1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE+2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE+3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE+4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE+5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE+6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE+7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel K256_4]
+
+ ;; load the address of each of the 4 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha256 + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha256 + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha256 + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha256 + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame ((16 + 8)*16 bytes)
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA256_DIGEST_WORDS)
+ vmovdqa [rsp + i*SZ4], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, STACK_size
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c
new file mode 100644
index 000000000..8c6995fb8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ Copyright (c) 2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define AVX
+#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_avx
+#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_avx
+#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_avx
+#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_avx
+#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_avx
+#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_avx
+#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_avx
+#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_avx
+#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_avx
+#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_avx
+#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_avx
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "include/snow3g_common.h"
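
This translation unit contains no code of its own: it maps the generic SNOW 3G entry-point names onto AVX-suffixed symbols and then includes the shared implementation. A minimal, hypothetical sketch of the pattern (names invented for illustration; the real bodies live in include/snow3g_common.h):

#include <stdio.h>

#define DEMO_BUFFER_FN demo_buffer_avx        /* chosen by the including .c file */

/* ---- what a shared "common" header effectively does ---- */
static void DEMO_BUFFER_FN(void)              /* emitted as demo_buffer_avx() */
{
        printf("shared code compiled once per SIMD flavour\n");
}
/* --------------------------------------------------------- */

int main(void)
{
        demo_buffer_avx();
        return 0;
}
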
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
new file mode 100755
index 000000000..e7c6bad8a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
@@ -0,0 +1,1146 @@
+;;
+;; Copyright (c) 2009-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+extern lookup_8bit_avx
+
+section .data
+default rel
+align 64
+S0:
+db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb
+db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90
+db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac
+db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38
+db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b
+db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c
+db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad
+db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8
+db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56
+db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe
+db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d
+db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23
+db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1
+db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f
+db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65
+db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60
+
+S1:
+db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77
+db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42
+db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1
+db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48
+db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87
+db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb
+db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09
+db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9
+db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9
+db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89
+db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4
+db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde
+db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21
+db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34
+db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28
+db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2
+
+EK_d:
+dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF,
+dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
+
+mask31:
+dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
+
+align 16
+bit_reverse_table_l:
+db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
+
+align 16
+bit_reverse_table_h:
+db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
+
+align 16
+bit_reverse_and_table:
+db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+
+align 16
+data_mask_64bits:
+dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+
+bit_mask_table:
+db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
+
+
+section .text
+align 64
+
+%define OFFSET_FR1 (16*4)
+%define OFFSET_FR2 (17*4)
+%define OFFSET_BRC_X0 (18*4)
+%define OFFSET_BRC_X1 (19*4)
+%define OFFSET_BRC_X2 (20*4)
+%define OFFSET_BRC_X3 (21*4)
+
+%define MASK31 xmm12
+
+%define OFS_R1 (16*(4*4))
+%define OFS_R2 (OFS_R1 + (4*4))
+%define OFS_X0 (OFS_R2 + (4*4))
+%define OFS_X1 (OFS_X0 + (4*4))
+%define OFS_X2 (OFS_X1 + (4*4))
+%define OFS_X3 (OFS_X2 + (4*4))
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET XMM_STORAGE
+
+%macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+%ifidn __OUTPUT_FORMAT__, win64
+ push rdi
+ push rsi
+%endif
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm7
+ vmovdqu [rsp + 2*16],xmm8
+ vmovdqu [rsp + 3*16],xmm9
+ vmovdqu [rsp + 4*16],xmm10
+ vmovdqu [rsp + 5*16],xmm11
+ vmovdqu [rsp + 6*16],xmm12
+ vmovdqu [rsp + 7*16],xmm13
+ vmovdqu [rsp + 8*16],xmm14
+ vmovdqu [rsp + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 9*16]
+ vmovdqu xmm14, [rsp + 8*16]
+ vmovdqu xmm13, [rsp + 7*16]
+ vmovdqu xmm12, [rsp + 6*16]
+ vmovdqu xmm11, [rsp + 5*16]
+ vmovdqu xmm10, [rsp + 4*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+%endif
+ mov rsp, r14
+%ifidn __OUTPUT_FORMAT__, win64
+ pop rsi
+ pop rdi
+%endif
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;
+;; make_u31()
+;;
+%macro make_u31 4
+
+%define %%Rt %1
+%define %%Ke %2
+%define %%Ek %3
+%define %%Iv %4
+ xor %%Rt, %%Rt
+ shrd %%Rt, %%Iv, 8
+ shrd %%Rt, %%Ek, 15
+ shrd %%Rt, %%Ke, 9
+%endmacro
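
make_u31 packs a key byte, a 15-bit D constant and an IV byte into one 31-bit LFSR word (s = k || d || iv) using shrd. Scalar equivalent (illustrative sketch):

#include <stdint.h>

/* s = k || d || iv  (8 + 15 + 8 = 31 bits), the ZUC LFSR key load. */
static inline uint32_t make_u31(uint8_t k, uint16_t d, uint8_t iv)
{
        return ((uint32_t)k << 23) | ((uint32_t)(d & 0x7FFF) << 8) | iv;
}
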
+
+
+;
+; bits_reorg4()
+;
+; params
+; %1 - round number
+; rax - LFSR pointer
+; uses
+;
+; return
+;
+%macro bits_reorg4 1
+ ;
+ ; xmm15 = LFSR_S15
+ ; xmm14 = LFSR_S14
+ ; xmm11 = LFSR_S11
+ ; xmm9 = LFSR_S9
+ ; xmm7 = LFSR_S7
+ ; xmm5 = LFSR_S5
+ ; xmm2 = LFSR_S2
+ ; xmm0 = LFSR_S0
+ ;
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+ vmovdqa xmm14, [rax + ((14 + %1) % 16)*16]
+ vmovdqa xmm11, [rax + ((11 + %1) % 16)*16]
+ vmovdqa xmm9, [rax + (( 9 + %1) % 16)*16]
+ vmovdqa xmm7, [rax + (( 7 + %1) % 16)*16]
+ vmovdqa xmm5, [rax + (( 5 + %1) % 16)*16]
+ vmovdqa xmm2, [rax + (( 2 + %1) % 16)*16]
+ vmovdqa xmm0, [rax + (( 0 + %1) % 16)*16]
+
+ vpxor xmm1, xmm1
+ vpslld xmm15, 1
+ vpblendw xmm3, xmm14, xmm1, 0xAA
+ vpblendw xmm15, xmm3, xmm15, 0xAA
+
+ vmovdqa [rax + OFS_X0], xmm15 ; BRC_X0
+ vpslld xmm11, 16
+ vpsrld xmm9, 15
+ vpor xmm11, xmm9
+ vmovdqa [rax + OFS_X1], xmm11 ; BRC_X1
+ vpslld xmm7, 16
+ vpsrld xmm5, 15
+ vpor xmm7, xmm5
+ vmovdqa [rax + OFS_X2], xmm7 ; BRC_X2
+ vpslld xmm2, 16
+ vpsrld xmm0, 15
+ vpor xmm2, xmm0
+ vmovdqa [rax + OFS_X3], xmm2 ; BRC_X3
+%endmacro
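
bits_reorg4 performs the ZUC bit-reorganization for four packets at once, building the four 32-bit BRC words from 16-bit halves of the 31-bit LFSR cells. Per-packet scalar reference (illustrative sketch):

#include <stdint.h>

/* s[] are the 31-bit LFSR cells; X[] are BRC_X0..BRC_X3. */
static void bit_reorg(const uint32_t s[16], uint32_t X[4])
{
        X[0] = ((s[15] >> 15) << 16)    | (s[14] & 0xFFFF);   /* s15H || s14L */
        X[1] = ((s[11] & 0xFFFF) << 16) | (s[9] >> 15);       /* s11L || s9H  */
        X[2] = ((s[7]  & 0xFFFF) << 16) | (s[5] >> 15);       /* s7L  || s5H  */
        X[3] = ((s[2]  & 0xFFFF) << 16) | (s[0] >> 15);       /* s2L  || s0H  */
}
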
+
+%macro lookup_single_sbox 2
+%define %%table %1 ; [in] Pointer to table to look up
+%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9)
+
+%ifdef SAFE_LOOKUP
+ ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10)
+ ;; and registers for param passing and return (4 regs, OS dependent)
+ ;; (6*16 + 6*8 = 144 bytes)
+ sub rsp, 144
+
+ vmovdqu [rsp], xmm0
+ vmovdqu [rsp + 16], xmm1
+ vmovdqu [rsp + 32], xmm2
+ vmovdqu [rsp + 48], xmm3
+ vmovdqu [rsp + 64], xmm4
+ vmovdqu [rsp + 80], xmm5
+ mov [rsp + 96], r9
+ mov [rsp + 104], r10
+
+%ifdef LINUX
+ mov [rsp + 112], rdi
+ mov [rsp + 120], rsi
+ mov [rsp + 128], rdx
+ mov rdi, %%table
+ mov rsi, %%idx_val
+ mov rdx, 256
+%else
+%ifnidni %%idx_val, rcx
+ mov [rsp + 112], rcx
+%endif
+%ifnidni %%idx_val, rdx
+ mov [rsp + 120], rdx
+%endif
+%ifnidni %%idx_val, r8
+ mov [rsp + 128], r8
+%endif
+
+ mov rdx, %%idx_val
+ mov rcx, %%table
+ mov r8, 256
+%endif
+ mov [rsp + 136], rax
+
+ call lookup_8bit_avx
+
+ ;; Restore all registers
+ vmovdqu xmm0, [rsp]
+ vmovdqu xmm1, [rsp + 16]
+ vmovdqu xmm2, [rsp + 32]
+ vmovdqu xmm3, [rsp + 48]
+ vmovdqu xmm4, [rsp + 64]
+ vmovdqu xmm5, [rsp + 80]
+ mov r9, [rsp + 96]
+ mov r10, [rsp + 104]
+
+%ifdef LINUX
+ mov rdi, [rsp + 112]
+ mov rsi, [rsp + 120]
+ mov rdx, [rsp + 128]
+%else
+%ifnidni %%idx_val, rcx
+ mov rcx, [rsp + 112]
+%endif
+%ifnidni %%idx_val, rdx
+ mov rdx, [rsp + 120]
+%endif
+%ifnidni %%idx_val, r8
+ mov r8, [rsp + 128]
+%endif
+%endif
+
+ ;; Move returned value from lookup function, before restoring rax
+ mov DWORD(%%idx_val), eax
+ mov rax, [rsp + 136]
+
+ add rsp, 144
+
+%else ;; SAFE_LOOKUP
+
+ movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val]
+
+%endif ;; SAFE_LOOKUP
+%endmacro
+
+;
+; sbox_lkup()
+;
+; params
+; %1 R1/R2 table offset
+; %2 R1/R2 entry offset
+; %3 xmm reg name
+; uses
+;	rcx,rdx,r8,r9,r10,rsi,rdi
+; return
+;
+%macro sbox_lkup 3
+ vpextrb rcx, %3, (0 + (%2 * 4))
+ lookup_single_sbox rsi, rcx
+ vpextrb rdx, %3, (1 + (%2 * 4))
+ lookup_single_sbox rdi, rdx
+
+ xor r10, r10
+ vpextrb r8, %3, (2 + (%2 * 4))
+ lookup_single_sbox rsi, r8
+ vpextrb r9, %3, (3 + (%2 * 4))
+ lookup_single_sbox rdi, r9
+
+ shrd r10d, ecx, 8
+ shrd r10d, edx, 8
+ shrd r10d, r8d, 8
+ shrd r10d, r9d, 8
+ mov [rax + %1 + (%2 * 4)], r10d
+%endmacro
+
+
+;
+; rot_mod32()
+;
+; uses xmm7
+;
+%macro rot_mod32 3
+ vpslld %1, %2, %3
+ vpsrld xmm7, %2, (32 - %3)
+
+ vpor %1, xmm7
+%endmacro
+
+
+;
+; nonlin_fun4()
+;
+; params
+; %1 == 1, then calculate W
+; uses
+;
+; return
+; xmm0 = W value, updates F_R1[] / F_R2[]
+;
+%macro nonlin_fun4 1
+
+%if (%1 == 1)
+ vmovdqa xmm0, [rax + OFS_X0]
+ vpxor xmm0, [rax + OFS_R1]
+ vpaddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2
+%endif
+ ;
+ vmovdqa xmm1, [rax + OFS_R1]
+ vmovdqa xmm2, [rax + OFS_R2]
+ vpaddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1
+ vpxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2
+ ;
+
+ vpslld xmm3, xmm1, 16
+ vpsrld xmm4, xmm1, 16
+ vpslld xmm5, xmm2, 16
+ vpsrld xmm6, xmm2, 16
+ vpor xmm1, xmm3, xmm6
+ vpor xmm2, xmm4, xmm5
+
+ ;
+ rot_mod32 xmm3, xmm1, 2
+ rot_mod32 xmm4, xmm1, 10
+ rot_mod32 xmm5, xmm1, 18
+ rot_mod32 xmm6, xmm1, 24
+ vpxor xmm1, xmm3
+ vpxor xmm1, xmm4
+ vpxor xmm1, xmm5
+ vpxor xmm1, xmm6 ; XMM1 = U = L1(P)
+
+ sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0]
+ sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1]
+ sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2]
+ sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3]
+ ;
+ rot_mod32 xmm3, xmm2, 8
+ rot_mod32 xmm4, xmm2, 14
+ rot_mod32 xmm5, xmm2, 22
+ rot_mod32 xmm6, xmm2, 30
+ vpxor xmm2, xmm3
+ vpxor xmm2, xmm4
+ vpxor xmm2, xmm5
+ vpxor xmm2, xmm6 ; XMM2 = V = L2(Q)
+ ;
+
+ sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0]
+ sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1]
+ sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2]
+ sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3]
+%endmacro
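
nonlin_fun4 evaluates the ZUC nonlinear function F for four lanes: W = (X0 ^ R1) + R2, W1 = R1 + X1, W2 = R2 ^ X2, followed by the linear transforms L1/L2 and the S0/S1 substitution that refresh R1/R2. Per-packet scalar reference (illustrative sketch; S0/S1 stand for the 256-byte tables in the data section above):

#include <stdint.h>

extern const uint8_t S0[256], S1[256];    /* S-box tables from the data section */

static inline uint32_t rotl32(uint32_t x, unsigned r)
{
        return (x << r) | (x >> (32 - r));
}

static inline uint32_t L1(uint32_t x)
{
        return x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24);
}

static inline uint32_t L2(uint32_t x)
{
        return x ^ rotl32(x, 8) ^ rotl32(x, 14) ^ rotl32(x, 22) ^ rotl32(x, 30);
}

static inline uint32_t sbox32(uint32_t x)  /* S0 on bytes 3 and 1, S1 on bytes 2 and 0 */
{
        return ((uint32_t)S0[(x >> 24) & 0xff] << 24) |
               ((uint32_t)S1[(x >> 16) & 0xff] << 16) |
               ((uint32_t)S0[(x >>  8) & 0xff] <<  8) |
                (uint32_t)S1[x & 0xff];
}

/* Returns W (the key-stream contribution) and refreshes R1/R2. */
static uint32_t zuc_f(const uint32_t X[4], uint32_t *R1, uint32_t *R2)
{
        uint32_t W  = (X[0] ^ *R1) + *R2;
        uint32_t W1 = *R1 + X[1];
        uint32_t W2 = *R2 ^ X[2];

        *R1 = sbox32(L1((W1 << 16) | (W2 >> 16)));
        *R2 = sbox32(L2((W2 << 16) | (W1 >> 16)));
        return W;
}
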
+
+
+;
+; store_kstr4()
+;
+; params
+;
+; uses
+; xmm0 as input
+; return
+;
+%macro store_kstr4 0
+ vpxor xmm0, [rax + OFS_X3]
+ vpextrd r15d, xmm0, 3
+ pop r9 ; *pKeyStr4
+ vpextrd r14d, xmm0, 2
+ pop r8 ; *pKeyStr3
+ vpextrd r13d, xmm0, 1
+ pop rdx ; *pKeyStr2
+ vpextrd r12d, xmm0, 0
+ pop rcx ; *pKeyStr1
+ mov [r9], r15d
+ mov [r8], r14d
+ mov [rdx], r13d
+ mov [rcx], r12d
+ add rcx, 4
+ add rdx, 4
+ add r8, 4
+ add r9, 4
+ push rcx
+ push rdx
+ push r8
+ push r9
+%endmacro
+
+
+;
+; add_mod31()
+; add two 32-bit args and reduce mod (2^31-1)
+; params
+; %1 - arg1/res
+; %2 - arg2
+; uses
+; xmm2
+; return
+; %1
+%macro add_mod31 2
+ vpaddd %1, %2
+ vpsrld xmm2, %1, 31
+ vpand %1, MASK31
+ vpaddd %1, xmm2
+%endmacro
+
+
+;
+; rot_mod31()
+; rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1)
+; params
+; %1 - arg
+; %2 - # of bits
+; uses
+; xmm2
+; return
+; %1
+%macro rot_mod31 2
+
+ vpslld xmm2, %1, %2
+ vpsrld %1, %1, (31 - %2)
+
+ vpor %1, xmm2
+ vpand %1, MASK31
+%endmacro
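
add_mod31 and rot_mod31 implement addition and left rotation modulo the prime 2^31 - 1 used by the ZUC LFSR; the carry out of bit 31 is folded back into the low bits. Scalar equivalents (illustrative sketch):

#include <stdint.h>

static inline uint32_t add_mod31(uint32_t a, uint32_t b)
{
        uint32_t sum = a + b;                        /* a, b < 2^31, no overflow */

        return (sum & 0x7FFFFFFF) + (sum >> 31);     /* fold the carry back in */
}

static inline uint32_t rot_mod31(uint32_t x, unsigned k)   /* 0 < k < 31 */
{
        return ((x << k) | (x >> (31 - k))) & 0x7FFFFFFF;
}
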
+
+
+;
+; lfsr_updt4()
+;
+; params
+; %1 - round number
+; uses
+; xmm0 as input (ZERO or W)
+; return
+;
+%macro lfsr_updt4 1
+ ;
+ ; xmm1 = LFSR_S0
+ ; xmm4 = LFSR_S4
+ ; xmm10 = LFSR_S10
+ ; xmm13 = LFSR_S13
+ ; xmm15 = LFSR_S15
+ ;
+ vpxor xmm3, xmm3
+ vmovdqa xmm1, [rax + (( 0 + %1) % 16)*16]
+ vmovdqa xmm4, [rax + (( 4 + %1) % 16)*16]
+ vmovdqa xmm10, [rax + ((10 + %1) % 16)*16]
+ vmovdqa xmm13, [rax + ((13 + %1) % 16)*16]
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+
+ ; Calculate LFSR feedback
+ add_mod31 xmm0, xmm1
+ rot_mod31 xmm1, 8
+ add_mod31 xmm0, xmm1
+ rot_mod31 xmm4, 20
+ add_mod31 xmm0, xmm4
+ rot_mod31 xmm10, 21
+ add_mod31 xmm0, xmm10
+ rot_mod31 xmm13, 17
+ add_mod31 xmm0, xmm13
+ rot_mod31 xmm15, 15
+ add_mod31 xmm0, xmm15
+
+
+
+ vmovdqa [rax + (( 0 + %1) % 16)*16], xmm0
+
+	; LFSR_S16 (the new LFSR_S15 after the virtual shift) is stored from xmm0
+%endmacro
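
lfsr_updt4 computes the LFSR feedback s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1 + 2^8)*s0 + W) mod (2^31 - 1) for all four lanes and writes it into the slot that becomes the new s15 of the rotating window. Per-packet scalar sketch (illustrative; uses add_mod31()/rot_mod31() from the sketch above):

/* "add" is the extra feedback term: (W >> 1) of the F output during
 * initialization (see the vpsrld before the macro call below) and 0 in
 * key-stream mode. */
static void lfsr_update(uint32_t s[16], uint32_t add)
{
        uint32_t v = add;
        int i;

        v = add_mod31(v, s[0]);
        v = add_mod31(v, rot_mod31(s[0], 8));
        v = add_mod31(v, rot_mod31(s[4], 20));
        v = add_mod31(v, rot_mod31(s[10], 21));
        v = add_mod31(v, rot_mod31(s[13], 17));
        v = add_mod31(v, rot_mod31(s[15], 15));

        /* the assembler keeps a rotating 16-entry window in memory
         * instead of physically moving the cells */
        for (i = 0; i < 15; i++)
                s[i] = s[i + 1];
        s[15] = v;
}
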
+
+
+;
+; key_expand_4()
+;
+%macro key_expand_4 2
+ movzx r8d, byte [rdi + (%1 + 0)]
+ movzx r9d, word [rbx + ((%1 + 0)*2)]
+ movzx r10d, byte [rsi + (%1 + 0)]
+ make_u31 r11d, r8d, r9d, r10d
+ mov [rax + (((%1 + 0)*16)+(%2*4))], r11d
+
+ movzx r12d, byte [rdi + (%1 + 1)]
+ movzx r13d, word [rbx + ((%1 + 1)*2)]
+ movzx r14d, byte [rsi + (%1 + 1)]
+ make_u31 r15d, r12d, r13d, r14d
+ mov [rax + (((%1 + 1)*16)+(%2*4))], r15d
+%endmacro
+
+
+MKGLOBAL(asm_ZucInitialization_4_avx,function,internal)
+asm_ZucInitialization_4_avx:
+
+%ifdef LINUX
+ %define pKe rdi
+ %define pIv rsi
+ %define pState rdx
+%else
+ %define pKe rcx
+ %define pIv rdx
+ %define pState r8
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push rdi
+ push rsi
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdx
+
+ lea rax, [pState] ; load pointer to LFSR
+ push pState ; Save LFSR Pointer to stack
+
+ ; setup the key pointer for first buffer key expand
+ mov rbx, [pKe] ; load the pointer to the array of keys into rbx
+
+ push pKe ; save rdi (key pointer) to the stack
+ lea rdi, [rbx] ; load the pointer to the first key into rdi
+
+
+ ; setup the IV pointer for first buffer key expand
+ mov rcx, [pIv] ; load the pointer to the array of IV's
+ push pIv ; save the IV pointer to the stack
+ lea rsi, [rcx] ; load the first IV pointer
+
+ lea rbx, [EK_d] ; load D variables
+
+ ; Expand key packet 1
+ key_expand_4 0, 0
+ key_expand_4 2, 0
+ key_expand_4 4, 0
+ key_expand_4 6, 0
+ key_expand_4 8, 0
+ key_expand_4 10, 0
+ key_expand_4 12, 0
+ key_expand_4 14, 0
+
+
+ ;second packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+8] ; load offset to IV 2 in array
+ lea rsi, [rcx] ; load pointer to IV2
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+8] ; load offset to key 2 in array
+ lea rdi, [rcx] ; load pointer to Key 2
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+
+ lea rbx, [EK_d]
+
+ ; Expand key packet 2
+ key_expand_4 0, 1
+ key_expand_4 2, 1
+ key_expand_4 4, 1
+ key_expand_4 6, 1
+ key_expand_4 8, 1
+ key_expand_4 10, 1
+ key_expand_4 12, 1
+ key_expand_4 14, 1
+
+
+
+ ;Third packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+16] ; load offset to IV 3 in array
+ lea rsi, [rcx] ; load pointer to IV3
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+16] ; load offset to key 3 in array
+ lea rdi, [rcx] ; load pointer to Key 3
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+ lea rbx, [EK_d]
+ ; Expand key packet 3
+ key_expand_4 0, 2
+ key_expand_4 2, 2
+ key_expand_4 4, 2
+ key_expand_4 6, 2
+ key_expand_4 8, 2
+ key_expand_4 10, 2
+ key_expand_4 12, 2
+ key_expand_4 14, 2
+
+
+
+ ;fourth packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+24] ; load offset to IV 4 in array
+ lea rsi, [rcx] ; load pointer to IV4
+
+ pop rbx ; get Key array pointer from Stack
+	mov rcx, [rbx+24]      ; load offset to key 4 in array
+	lea rdi, [rcx]         ; load pointer to Key 4
+ lea rbx, [EK_d]
+ ; Expand key packet 4
+ key_expand_4 0, 3
+ key_expand_4 2, 3
+ key_expand_4 4, 3
+ key_expand_4 6, 3
+ key_expand_4 8, 3
+ key_expand_4 10, 3
+ key_expand_4 12, 3
+ key_expand_4 14, 3
+
+ ; Set R1 and R2 to zero
+ ;xor r10, r10
+ ;xor r11, r11
+
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Shift LFSR 32-times, update state variables
+%assign N 0
+%rep 32
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 N
+ nonlin_fun4 1
+ vpsrld xmm0,1 ; Shift out LSB of W
+
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero
+%assign N N+1
+%endrep
+
+ ; And once more, initial round from keygen phase = 33 times
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 0
+ nonlin_fun4 0
+
+ pop rdx
+ lea rax, [rdx]
+
+ vpxor xmm0, xmm0
+ lfsr_updt4 0
+
+
+
+ ; Restore non-volatile registers
+ pop rdx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ pop rbx
+
+ ret
+;
+;
+;
+;;
+;; void asm_ZucGenKeystream64B_4_avx(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4);
+;;
+;; WIN64
+;; RCX - pSta
+;; RDX - pKeyStr1
+;; R8 - pKeyStr2
+;; R9 - pKeyStr3
+;; Stack - pKeyStr4
+;;
+;; LIN64
+;; RDI - pSta
+;; RSI - pKeyStr1
+;; RDX - pKeyStr2
+;; RCX - pKeyStr3
+;; R8 - pKeyStr4
+;;
+MKGLOBAL(asm_ZucGenKeystream64B_4_avx,function,internal)
+asm_ZucGenKeystream64B_4_avx:
+
+%ifdef LINUX
+ %define pState rdi
+ %define pKS1 rsi
+ %define pKS2 rdx
+ %define pKS3 rcx
+ %define pKS4 r8
+%else
+ %define pState rcx
+ %define pKS1 rdx
+ %define pKS2 r8
+ %define pKS3 r9
+ %define pKS4 rax
+%endif
+
+%ifndef LINUX
+ mov rax, [rsp + 8*5] ; 5th parameter from stack
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+
+%ifndef LINUX
+ push rdi
+ push rsi
+%endif
+ ; Store 4 keystream pointers on the stack
+
+ push pKS1
+ push pKS2
+ push pKS3
+ push pKS4
+
+
+ ; Load state pointer in RAX
+ mov rax, pState
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Generate 64B of keystream in 16 rounds
+%assign N 1
+%rep 16
+ bits_reorg4 N
+ nonlin_fun4 1
+ store_kstr4
+ vpxor xmm0, xmm0
+ lfsr_updt4 N
+%assign N N+1
+%endrep
+
+ ; Take keystream pointers off (#push = #pops)
+ pop rax
+ pop rax
+ pop rax
+ pop rax
+
+%ifndef LINUX
+ pop rsi
+ pop rdi
+%endif
+
+ ; Restore non-volatile registers
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
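
Each of the 16 iterations above produces one 32-bit key-stream word per lane: bit reorganization, F, z = W ^ X3, then an LFSR update with no feedback. Per-packet scalar sketch tying together the earlier illustrative helpers (bit_reorg(), zuc_f() and lfsr_update() from the sketches above):

static uint32_t zuc_keystream_word(uint32_t s[16], uint32_t *R1, uint32_t *R2)
{
        uint32_t X[4];
        uint32_t z;

        bit_reorg(s, X);
        z = zuc_f(X, R1, R2) ^ X[3];    /* z = W ^ BRC_X3, as in store_kstr4 */
        lfsr_update(s, 0);              /* work mode: no feedback from W */
        return z;
}
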
+
+
+;;
+;; extern uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, uint64_t n_bits)
+;;
+;; Returns authentication update value to be XOR'ed with current authentication tag
+;;
+;; WIN64
+;; RCX - KS (key stream pointer)
+;; RDX - DATA (data pointer)
+;; R8 - N_BITS (number data bits to process)
+;; LIN64
+;; RDI - KS (key stream pointer)
+;; RSI - DATA (data pointer)
+;; RDX - N_BITS (number data bits to process)
+;;
+align 64
+MKGLOBAL(asm_Eia3RemainderAVX,function,internal)
+asm_Eia3RemainderAVX:
+
+%ifdef LINUX
+ %define KS rdi
+ %define DATA rsi
+ %define N_BITS rdx
+%else
+ %define KS rcx
+ %define DATA rdx
+ %define N_BITS r8
+%endif
+ FUNC_SAVE
+
+ vmovdqa xmm5, [bit_reverse_table_l]
+ vmovdqa xmm6, [bit_reverse_table_h]
+ vmovdqa xmm7, [bit_reverse_and_table]
+ vmovdqa xmm10, [data_mask_64bits]
+ vpxor xmm9, xmm9
+
+%rep 3
+ cmp N_BITS, 128
+ jb Eia3RoundsAVX_dq_end
+
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm7
+
+ vmovdqa xmm2, xmm7
+ vpandn xmm2, xmm0
+ vpsrld xmm2, 4
+
+ vmovdqa xmm8, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm8, xmm1
+
+ vmovdqa xmm4, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm4, xmm2
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+ vmovdqu xmm3, [KS + (0*4)]
+ vmovdqu xmm4, [KS + (2*4)]
+ vpshufd xmm0, xmm3, 0x61
+ vpshufd xmm1, xmm4, 0x61
+
+ ;; - set up DATA
+ vmovdqa xmm2, xmm8
+ vpand xmm2, xmm10
+ vpshufd xmm3, xmm2, 0xdc
+ vmovdqa xmm4, xmm3
+
+ vpsrldq xmm8, 8
+ vpshufd xmm13, xmm8, 0xdc
+ vmovdqa xmm14, xmm13
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+ vpclmulqdq xmm3, xmm0, 0x00
+ vpclmulqdq xmm4, xmm0, 0x11
+ vpclmulqdq xmm13, xmm1, 0x00
+ vpclmulqdq xmm14, xmm1, 0x11
+
+ vpxor xmm3, xmm4
+ vpxor xmm13, xmm14
+ vpxor xmm9, xmm3
+ vpxor xmm9, xmm13
+ lea DATA, [DATA + 16]
+ lea KS, [KS + 16]
+ sub N_BITS, 128
+%endrep
+Eia3RoundsAVX_dq_end:
+
+%rep 3
+ cmp N_BITS, 32
+ jb Eia3RoundsAVX_dw_end
+
+ ;; swap dwords in KS
+ vmovq xmm1, [KS]
+ vpshufd xmm4, xmm1, 0xf1
+
+ ;; bit-reverse 4 bytes of data
+ vmovdqa xmm2, xmm7
+ vmovd xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm2
+
+ vpandn xmm2, xmm0
+ vpsrld xmm2, 4
+
+ vmovdqa xmm0, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm0, xmm1
+
+ vmovdqa xmm3, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm3, xmm2
+
+ vpor xmm0, xmm3
+
+ ;; rol & xor
+ vpclmulqdq xmm0, xmm4, 0
+ vpxor xmm9, xmm0
+
+ lea DATA, [DATA + 4]
+ lea KS, [KS + 4]
+ sub N_BITS, 32
+%endrep
+
+Eia3RoundsAVX_dw_end:
+ vmovq rax, xmm9
+ shr rax, 32
+
+ or N_BITS, N_BITS
+ jz Eia3RoundsAVX_byte_loop_end
+
+ ;; get 64-bit key stream for the last data bits (less than 32)
+ mov KS, [KS]
+
+ ;; process remaining data bytes and bits
+Eia3RoundsAVX_byte_loop:
+ or N_BITS, N_BITS
+ jz Eia3RoundsAVX_byte_loop_end
+
+ cmp N_BITS, 8
+ jb Eia3RoundsAVX_byte_partial
+
+ movzx r11, byte [DATA]
+ sub N_BITS, 8
+ jmp Eia3RoundsAVX_byte_read
+
+Eia3RoundsAVX_byte_partial:
+ ;; process remaining bits (up to 7)
+ lea r11, [bit_mask_table]
+ movzx r10, byte [r11 + N_BITS]
+ movzx r11, byte [DATA]
+ and r11, r10
+ xor N_BITS, N_BITS
+Eia3RoundsAVX_byte_read:
+
+%assign DATATEST 0x80
+%rep 8
+ xor r10, r10
+ test r11, DATATEST
+ cmovne r10, KS
+ xor rax, r10
+ rol KS, 1
+%assign DATATEST (DATATEST >> 1)
+%endrep ; byte boundary
+ lea DATA, [DATA + 1]
+ jmp Eia3RoundsAVX_byte_loop
+
+Eia3RoundsAVX_byte_loop_end:
+
+ ;; eax - holds the return value at this stage
+ FUNC_RESTORE
+
+ ret
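
Both asm_Eia3RemainderAVX and asm_Eia3Round64BAVX accumulate the 128-EIA3 tag update: for every set message bit i, the 32 key-stream bits starting at bit offset i are XORed into the tag; the vector path does this with bit reversal plus carry-less multiplies, while the byte loop above handles the tail. Spec-level scalar reference (illustrative sketch, treating both inputs as MSB-first bit strings and ignoring the little-endian word layout the assembler compensates for):

#include <stdint.h>

static inline uint32_t get_bit(const uint8_t *p, uint64_t i)
{
        return (p[i / 8] >> (7 - (i % 8))) & 1;
}

/* T is the running tag; ks must hold at least n_bits + 32 key-stream bits. */
static uint32_t eia3_update(uint32_t T, const uint8_t *ks,
                            const uint8_t *data, uint64_t n_bits)
{
        uint64_t i;
        int b;

        for (i = 0; i < n_bits; i++) {
                if (get_bit(data, i)) {
                        uint32_t z = 0;

                        for (b = 0; b < 32; b++)        /* z_i .. z_(i+31) */
                                z = (z << 1) | get_bit(ks, i + b);
                        T ^= z;
                }
        }
        return T;
}
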
+
+;;
+;; extern uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *KS, const void *DATA)
+;;
+;; Updates authentication tag T based on keystream KS and DATA.
+;; - it processes 64 bytes of DATA
+;; - reads data in 16 byte chunks and bit reverses them
+;; - reads and re-arranges KS
+;; - employs clmul for the XOR & ROL part
+;; - copies top 64 bytes of KS to bottom (for the next round)
+;;
+;; WIN64
+;;	RCX - T
+;;	RDX - KS pointer to key stream (2 x 64 bytes)
+;;	R8  - DATA pointer to data
+;; LIN64
+;; RDI - T
+;; RSI - KS pointer to key stream (2 x 64 bytes)
+;; RDX - DATA pointer to data
+;;
+align 64
+MKGLOBAL(asm_Eia3Round64BAVX,function,internal)
+asm_Eia3Round64BAVX:
+
+%ifdef LINUX
+ %define T edi
+ %define KS rsi
+ %define DATA rdx
+%else
+ %define T ecx
+ %define KS rdx
+ %define DATA r8
+%endif
+
+ FUNC_SAVE
+
+ vmovdqa xmm5, [bit_reverse_table_l]
+ vmovdqa xmm6, [bit_reverse_table_h]
+ vmovdqa xmm7, [bit_reverse_and_table]
+ vmovdqa xmm10, [data_mask_64bits]
+
+ vpxor xmm9, xmm9
+%assign I 0
+%rep 4
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA + 16*I]
+ vpand xmm1, xmm0, xmm7
+
+ vpandn xmm2, xmm7, xmm0
+ vpsrld xmm2, 4
+
+ vpshufb xmm8, xmm6, xmm1 ; bit reverse low nibbles (use high table)
+ vpshufb xmm4, xmm5, xmm2 ; bit reverse high nibbles (use low table)
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+%if I != 0
+ vmovdqa xmm11, xmm12
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%else
+ vmovdqu xmm11, [KS + (I*16) + (0*4)]
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%endif
+ vpalignr xmm13, xmm12, xmm11, 8
+ vpshufd xmm2, xmm11, 0x61
+ vpshufd xmm3, xmm13, 0x61
+
+ ;; - set up DATA
+ vpand xmm13, xmm10, xmm8
+ vpshufd xmm0, xmm13, 0xdc
+
+ vpsrldq xmm8, 8
+ vpshufd xmm1, xmm8, 0xdc
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+%if I != 0
+ vpclmulqdq xmm13, xmm0, xmm2, 0x00
+ vpclmulqdq xmm14, xmm0, xmm2, 0x11
+ vpclmulqdq xmm15, xmm1, xmm3, 0x00
+ vpclmulqdq xmm8, xmm1, xmm3, 0x11
+
+ vpxor xmm13, xmm14
+ vpxor xmm15, xmm8
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm15
+%else
+ vpclmulqdq xmm9, xmm0, xmm2, 0x00
+ vpclmulqdq xmm13, xmm0, xmm2, 0x11
+ vpclmulqdq xmm14, xmm1, xmm3, 0x00
+ vpclmulqdq xmm15, xmm1, xmm3, 0x11
+
+ vpxor xmm14, xmm15
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm14
+%endif
+
+
+%assign I (I + 1)
+%endrep
+
+ ;; - update T
+ vmovq rax, xmm9
+ shr rax, 32
+ xor eax, T
+
+ FUNC_RESTORE
+
+ ret
+
+
+;----------------------------------------------------------------------------------------
+;----------------------------------------------------------------------------------------
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c
new file mode 100755
index 000000000..b3ba2de81
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c
@@ -0,0 +1,548 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+* zuc_avx_top.c
+*-----------------------------------------------------------------------
+* An implementation of ZUC, the core algorithm for the
+* 3GPP Confidentiality and Integrity algorithms.
+*
+*-----------------------------------------------------------------------*/
+
+#include <string.h>
+
+#include "include/zuc_internal.h"
+#include "include/wireless_common.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+#include "intel-ipsec-mb.h"
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+static inline
+void _zuc_eea3_1_buffer_avx(const void *pKey,
+ const void *pIv,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t length)
+{
+ DECLARE_ALIGNED(ZucState_t zucState, 64);
+ DECLARE_ALIGNED(uint8_t keyStream[64], 64);
+ /* buffer to store 64 bytes of keystream */
+ DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+ DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+
+ const uint64_t *pIn64 = NULL;
+ const uint8_t *pIn8 = NULL;
+ uint8_t *pOut8 = NULL;
+ uint64_t *pOut64 = NULL, *pKeyStream64 = NULL;
+ uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL;
+
+	uint32_t numKeyStreamsPerPkt = length / ZUC_KEYSTR_LEN;
+ uint32_t numBytesLeftOver = length % ZUC_KEYSTR_LEN;
+
+ /* need to set the LFSR state to zero */
+ memset(&zucState, 0, sizeof(ZucState_t));
+
+ /* initialize the zuc state */
+ asm_ZucInitialization(pKey, pIv, &(zucState));
+
+ /* Loop Over all the Quad-Words in input buffer and XOR with the 64bits
+ * of generated keystream */
+ pOut64 = (uint64_t *) pBufferOut;
+ pIn64 = (const uint64_t *) pBufferIn;
+
+ while (numKeyStreamsPerPkt--) {
+ /* Generate the key stream 64 bytes at a time */
+ asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+ /* XOR The Keystream generated with the input buffer here */
+ pKeyStream64 = (uint64_t *) keyStream;
+ asm_XorKeyStream64B_avx(pIn64, pOut64, pKeyStream64);
+ pIn64 += 8;
+ pOut64 += 8;
+ }
+
+ /* Check for remaining 0 to 63 bytes */
+ pIn8 = (const uint8_t *) pBufferIn;
+ pOut8 = (uint8_t *) pBufferOut;
+ if(numBytesLeftOver) {
+ asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+ /* copy the remaining bytes into temporary buffer and XOR with
+ * the 64-bytes of keystream. Then copy on the valid bytes back
+ * to the output buffer */
+
+ memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver],
+ numBytesLeftOver);
+ pKeyStream64 = (uint64_t *) &keyStream[0];
+ pTemp64 = (uint64_t *) &tempSrc[0];
+ pdstTemp64 = (uint64_t *) &tempDst[0];
+
+ asm_XorKeyStream64B_avx(pTemp64, pdstTemp64, pKeyStream64);
+ memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0],
+ numBytesLeftOver);
+
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(keyStream, sizeof(keyStream));
+ clear_mem(&zucState, sizeof(zucState));
+#endif
+}
+
+static inline
+void _zuc_eea3_4_buffer_avx(const void * const pKey[4],
+ const void * const pIv[4],
+ const void * const pBufferIn[4],
+ void *pBufferOut[4],
+ const uint32_t length[4])
+{
+ DECLARE_ALIGNED(ZucState4_t state, 64);
+ DECLARE_ALIGNED(ZucState_t singlePktState, 64);
+ unsigned int i = 0;
+ /* Calculate the minimum input packet size */
+ uint32_t bytes1 = (length[0] < length[1] ?
+ length[0] : length[1]);
+ uint32_t bytes2 = (length[2] < length[3] ?
+ length[2] : length[3]);
+ /* min number of bytes */
+ uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
+ uint32_t numKeyStreamsPerPkt = bytes/ZUC_KEYSTR_LEN;
+ uint32_t remainBytes[4] = {0};
+ DECLARE_ALIGNED(uint8_t keyStr1[64], 64);
+ DECLARE_ALIGNED(uint8_t keyStr2[64], 64);
+ DECLARE_ALIGNED(uint8_t keyStr3[64], 64);
+ DECLARE_ALIGNED(uint8_t keyStr4[64], 64);
+ DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+ DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+ /* structure to store the 4 keys */
+ DECLARE_ALIGNED(ZucKey4_t keys, 64);
+ /* structure to store the 4 IV's */
+ DECLARE_ALIGNED(ZucIv4_t ivs, 64);
+ uint32_t numBytesLeftOver = 0;
+ const uint8_t *pTempBufInPtr = NULL;
+ uint8_t *pTempBufOutPtr = NULL;
+
+ const uint64_t *pIn64_0 = NULL;
+ const uint64_t *pIn64_1 = NULL;
+ const uint64_t *pIn64_2 = NULL;
+ const uint64_t *pIn64_3 = NULL;
+ uint64_t *pOut64_0 = NULL;
+ uint64_t *pOut64_1 = NULL;
+ uint64_t *pOut64_2 = NULL;
+ uint64_t *pOut64_3 = NULL;
+ uint64_t *pTempSrc64 = NULL;
+ uint64_t *pTempDst64 = NULL;
+ uint64_t *pKeyStream64 = NULL;
+
+ /* rounded down minimum length */
+ bytes = numKeyStreamsPerPkt * ZUC_KEYSTR_LEN;
+
+ /* Need to set the LFSR state to zero */
+ memset(&state, 0, sizeof(ZucState4_t));
+
+ /* Calculate the number of bytes left over for each packet */
+ for (i=0; i< 4; i++)
+ remainBytes[i] = length[i] - bytes;
+
+ /* Setup the Keys */
+ keys.pKey1 = pKey[0];
+ keys.pKey2 = pKey[1];
+ keys.pKey3 = pKey[2];
+ keys.pKey4 = pKey[3];
+
+ /* setup the IV's */
+ ivs.pIv1 = pIv[0];
+ ivs.pIv2 = pIv[1];
+ ivs.pIv3 = pIv[2];
+ ivs.pIv4 = pIv[3];
+
+ asm_ZucInitialization_4_avx( &keys, &ivs, &state);
+
+ pOut64_0 = (uint64_t *) pBufferOut[0];
+ pOut64_1 = (uint64_t *) pBufferOut[1];
+ pOut64_2 = (uint64_t *) pBufferOut[2];
+ pOut64_3 = (uint64_t *) pBufferOut[3];
+
+ pIn64_0 = (const uint64_t *) pBufferIn[0];
+ pIn64_1 = (const uint64_t *) pBufferIn[1];
+ pIn64_2 = (const uint64_t *) pBufferIn[2];
+ pIn64_3 = (const uint64_t *) pBufferIn[3];
+
+ /* Loop for 64 bytes at a time generating 4 key-streams per loop */
+ while (numKeyStreamsPerPkt) {
+ /* Generate 64 bytes at a time */
+ asm_ZucGenKeystream64B_4_avx(&state,
+ (uint32_t *) keyStr1,
+ (uint32_t *) keyStr2,
+ (uint32_t *) keyStr3,
+ (uint32_t *) keyStr4);
+
+ /* XOR the KeyStream with the input buffers and store in output
+ * buffer*/
+ pKeyStream64 = (uint64_t *) keyStr1;
+ asm_XorKeyStream64B_avx(pIn64_0, pOut64_0, pKeyStream64);
+ pIn64_0 += 8;
+ pOut64_0 += 8;
+
+ pKeyStream64 = (uint64_t *) keyStr2;
+ asm_XorKeyStream64B_avx(pIn64_1, pOut64_1, pKeyStream64);
+ pIn64_1 += 8;
+ pOut64_1 += 8;
+
+ pKeyStream64 = (uint64_t *) keyStr3;
+ asm_XorKeyStream64B_avx(pIn64_2, pOut64_2, pKeyStream64);
+ pIn64_2 += 8;
+ pOut64_2 += 8;
+
+ pKeyStream64 = (uint64_t *) keyStr4;
+ asm_XorKeyStream64B_avx(pIn64_3, pOut64_3, pKeyStream64);
+ pIn64_3 += 8;
+ pOut64_3 += 8;
+
+ /* Update keystream count */
+ numKeyStreamsPerPkt--;
+
+ }
+
+ /* process each packet separately for the remaining bytes */
+ for (i = 0; i < 4; i++) {
+ if (remainBytes[i]) {
+ /* need to copy the zuc state to single packet state */
+ singlePktState.lfsrState[0] = state.lfsrState[0][i];
+ singlePktState.lfsrState[1] = state.lfsrState[1][i];
+ singlePktState.lfsrState[2] = state.lfsrState[2][i];
+ singlePktState.lfsrState[3] = state.lfsrState[3][i];
+ singlePktState.lfsrState[4] = state.lfsrState[4][i];
+ singlePktState.lfsrState[5] = state.lfsrState[5][i];
+ singlePktState.lfsrState[6] = state.lfsrState[6][i];
+ singlePktState.lfsrState[7] = state.lfsrState[7][i];
+ singlePktState.lfsrState[8] = state.lfsrState[8][i];
+ singlePktState.lfsrState[9] = state.lfsrState[9][i];
+ singlePktState.lfsrState[10] = state.lfsrState[10][i];
+ singlePktState.lfsrState[11] = state.lfsrState[11][i];
+ singlePktState.lfsrState[12] = state.lfsrState[12][i];
+ singlePktState.lfsrState[13] = state.lfsrState[13][i];
+ singlePktState.lfsrState[14] = state.lfsrState[14][i];
+ singlePktState.lfsrState[15] = state.lfsrState[15][i];
+
+ singlePktState.fR1 = state.fR1[i];
+ singlePktState.fR2 = state.fR2[i];
+
+ singlePktState.bX0 = state.bX0[i];
+ singlePktState.bX1 = state.bX1[i];
+ singlePktState.bX2 = state.bX2[i];
+ singlePktState.bX3 = state.bX3[i];
+
+ numKeyStreamsPerPkt = remainBytes[i] / ZUC_KEYSTR_LEN;
+ numBytesLeftOver = remainBytes[i] % ZUC_KEYSTR_LEN;
+
+ pTempBufInPtr = pBufferIn[i];
+ pTempBufOutPtr = pBufferOut[i];
+
+ /* update the output and input pointers here to point
+ * to the i'th buffers */
+ pOut64_0 = (uint64_t *) &pTempBufOutPtr[length[i] -
+ remainBytes[i]];
+ pIn64_0 = (const uint64_t *) &pTempBufInPtr[length[i] -
+ remainBytes[i]];
+
+ while (numKeyStreamsPerPkt--) {
+ /* Generate the key stream 64 bytes at a time */
+ asm_ZucGenKeystream64B((uint32_t *) keyStr1,
+ &singlePktState);
+ pKeyStream64 = (uint64_t *) keyStr1;
+ asm_XorKeyStream64B_avx(pIn64_0, pOut64_0,
+ pKeyStream64);
+ pIn64_0 += 8;
+ pOut64_0 += 8;
+ }
+
+
+ /* Check for remaining 0 to 63 bytes */
+ if (numBytesLeftOver) {
+ asm_ZucGenKeystream64B((uint32_t *) &keyStr1,
+ &singlePktState);
+ uint32_t offset = length[i] - numBytesLeftOver;
+
+ /* copy the remaining bytes into temporary
+ * buffer and XOR with the 64-bytes of
+ * keystream. Then copy on the valid bytes back
+ * to the output buffer */
+ memcpy(&tempSrc[0], &pTempBufInPtr[offset],
+ numBytesLeftOver);
+ memset(&tempSrc[numBytesLeftOver], 0,
+ 64 - numBytesLeftOver);
+
+ pKeyStream64 = (uint64_t *) &keyStr1[0];
+ pTempSrc64 = (uint64_t *) &tempSrc[0];
+ pTempDst64 = (uint64_t *) &tempDst[0];
+ asm_XorKeyStream64B_avx(pTempSrc64, pTempDst64,
+ pKeyStream64);
+
+ memcpy(&pTempBufOutPtr[offset],
+ &tempDst[0], numBytesLeftOver);
+ }
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(keyStr1, sizeof(keyStr1));
+ clear_mem(keyStr2, sizeof(keyStr2));
+ clear_mem(keyStr3, sizeof(keyStr3));
+ clear_mem(keyStr4, sizeof(keyStr4));
+ clear_mem(&singlePktState, sizeof(singlePktState));
+ clear_mem(&state, sizeof(state));
+ clear_mem(&keys, sizeof(keys));
+ clear_mem(&ivs, sizeof(ivs));
+#endif
+}
+
+void zuc_eea3_1_buffer_avx(const void *pKey,
+ const void *pIv,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t length)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
+ pBufferOut == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (length < ZUC_MIN_LEN || length > ZUC_MAX_LEN)
+ return;
+#endif
+ _zuc_eea3_1_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void zuc_eea3_4_buffer_avx(const void * const pKey[4],
+ const void * const pIv[4],
+ const void * const pBufferIn[4],
+ void *pBufferOut[4],
+ const uint32_t length[4])
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+ unsigned int i;
+
+ /* Check for NULL pointers */
+ if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
+ pBufferOut == NULL || length == NULL)
+ return;
+
+ for (i = 0; i < 4; i++) {
+ if (pKey[i] == NULL || pIv[i] == NULL ||
+ pBufferIn[i] == NULL || pBufferOut[i] == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN)
+ return;
+ }
+#endif
+
+ _zuc_eea3_4_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[],
+ const void * const pBufferIn[], void *pBufferOut[],
+ const uint32_t length[],
+ const uint32_t numBuffers)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+
+ unsigned int i;
+ unsigned int packetCount = numBuffers;
+
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
+ pBufferOut == NULL || length == NULL)
+ return;
+
+ for (i = 0; i < numBuffers; i++) {
+ if (pKey[i] == NULL || pIv[i] == NULL ||
+ pBufferIn[i] == NULL || pBufferOut[i] == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN)
+ return;
+ }
+#endif
+ i = 0;
+
+	while (packetCount >= 4) {
+		packetCount -= 4;
+		_zuc_eea3_4_buffer_avx(&pKey[i],
+				       &pIv[i],
+				       &pBufferIn[i],
+				       &pBufferOut[i],
+				       &length[i]);
+		i += 4;
+	}
+
+	while (packetCount--) {
+		_zuc_eea3_1_buffer_avx(pKey[i],
+				       pIv[i],
+				       pBufferIn[i],
+				       pBufferOut[i],
+				       length[i]);
+		i++;
+	}
+#ifdef SAFE_DATA
+ /* Clear sensitive data in registers */
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}
+
+static inline uint64_t rotate_left(uint64_t u, size_t r)
+{
+ return (((u) << (r)) | ((u) >> (64 - (r))));
+}
+
+static inline uint64_t load_uint64(const void *ptr)
+{
+ return *((const uint64_t *)ptr);
+}
+
+void zuc_eia3_1_buffer_avx(const void *pKey,
+ const void *pIv,
+ const void *pBufferIn,
+ const uint32_t lengthInBits,
+ uint32_t *pMacI)
+{
+#ifndef LINUX
+ DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+ SAVE_XMMS(xmm_save);
+#endif
+ DECLARE_ALIGNED(ZucState_t zucState, 64);
+ DECLARE_ALIGNED(uint32_t keyStream[16 * 2], 64);
+ const uint32_t keyStreamLengthInBits = ZUC_KEYSTR_LEN * 8;
+ /* generate a key-stream 2 words longer than the input message */
+ const uint32_t N = lengthInBits + (2 * ZUC_WORD);
+ uint32_t L = (N + 31) / ZUC_WORD;
+ uint32_t *pZuc = (uint32_t *) &keyStream[0];
+ uint32_t remainingBits = lengthInBits;
+ uint32_t T = 0;
+ const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
+
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL)
+ return;
+
+ /* Check input data is in range of supported length */
+ if (lengthInBits < ZUC_MIN_LEN || lengthInBits > ZUC_MAX_LEN)
+ return;
+#endif
+
+ memset(&zucState, 0, sizeof(ZucState_t));
+
+ asm_ZucInitialization(pKey, pIv, &(zucState));
+ asm_ZucGenKeystream64B(pZuc, &zucState);
+
+ /* loop over the message bits */
+ while (remainingBits >= keyStreamLengthInBits) {
+ remainingBits -= keyStreamLengthInBits;
+ L -= (keyStreamLengthInBits / 32);
+ /* Generate the next key stream 8 bytes or 64 bytes */
+ if (!remainingBits)
+ asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+ else
+ asm_ZucGenKeystream64B(&keyStream[16], &zucState);
+ T = asm_Eia3Round64BAVX(T, &keyStream[0], pIn8);
+ memcpy(&keyStream[0], &keyStream[16], 16 * sizeof(uint32_t));
+ pIn8 = &pIn8[ZUC_KEYSTR_LEN];
+ }
+
+ /*
+	 * If there are more than 14 ZUC WORDS (32-bit words) of bits remaining,
+	 * the keystream needs up to another 2 ZUC WORDS (8B)
+ */
+ if (remainingBits > (14 * 32))
+ asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+ T ^= asm_Eia3RemainderAVX(&keyStream[0], pIn8, remainingBits);
+ T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]),
+ remainingBits % 32);
+
+ /* save the final MAC-I result */
+ uint32_t keyBlock = keyStream[L - 1];
+ *pMacI = bswap4(T ^ keyBlock);
+
+#ifdef SAFE_DATA
+ /* Clear sensitive data (in registers and stack) */
+ clear_mem(keyStream, sizeof(keyStream));
+ clear_mem(&zucState, sizeof(zucState));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+ RESTORE_XMMS(xmm_save);
+#endif
+}